-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1541 from UUDigitalHumanitieslab/feature/corpus-readers
Feature/corpus readers
- Loading branch information
Showing
22 changed files
with
188 additions
and
71 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
19 changes: 19 additions & 0 deletions
19
backend/addcorpus/migrations/0021_corpusconfiguration_data_directory.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
# Generated by Django 4.2.10 on 2024-04-16 12:11 | ||
|
||
import addcorpus.validation.creation | ||
from django.db import migrations, models | ||
|
||
|
||
class Migration(migrations.Migration):
    # Adds the `data_directory` column to the `corpusconfiguration` model.

    # Must run after the previous addcorpus migration.
    dependencies = [('addcorpus', '0020_simplify_extraction_data')]

    operations = [
        migrations.AddField(
            model_name='corpusconfiguration',
            name='data_directory',
            # Blank is allowed; non-blank values go through the
            # source-data-directory validator.
            field=models.CharField(
                blank=True,
                help_text='path to directory containing source data files',
                max_length=200,
                validators=[
                    addcorpus.validation.creation.validate_source_data_directory,
                ],
            ),
        ),
    ]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
import glob | ||
|
||
from addcorpus.models import Corpus, Field | ||
from addcorpus.python_corpora.load_corpus import load_corpus_definition | ||
from ianalyzer_readers.extract import CSV | ||
from ianalyzer_readers.readers.core import Field as ReaderField | ||
from ianalyzer_readers.readers.core import Reader | ||
from ianalyzer_readers.readers.csv import CSVReader | ||
|
||
|
||
def make_reader_field(corpus_field: Field) -> ReaderField:
    '''
    Convert a database `Field` into a reader field.

    The reader field keeps the name of the database field and extracts its
    value with a `CSV` extractor built from the field's `extract_column`.
    '''
    column_extractor = CSV(corpus_field.extract_column)
    field_name = corpus_field.name
    return ReaderField(name=field_name, extractor=column_extractor)
|
||
|
||
def make_reader(corpus: Corpus) -> Reader:
    '''
    From a corpus, returns a Reader object that allows source extraction.

    For Python corpora, simply loads the definition class;
    for JSON based corpora, constructs a CSV reader from the database
    configuration (data directory, delimiter and fields).

    Parameters:
        corpus: the corpus to create a reader for.

    Returns:
        The loaded corpus definition if the corpus has a Python definition,
        otherwise an instance of a `CSVReader` subclass configured from
        `corpus.configuration`.
    '''
    if corpus.has_python_definition:
        return load_corpus_definition(corpus.name)

    configuration = corpus.configuration

    class NewReader(CSVReader):
        data_directory = configuration.data_directory
        delimiter = configuration.source_data_delimiter
        fields = [
            make_reader_field(field)
            for field in configuration.fields.all()
        ]

        def sources(self, *args, **kwargs):
            '''
            Yield a `(filename, metadata)` tuple for every CSV file found
            (recursively) in the data directory; metadata is always empty.
            '''
            if self.data_directory:
                # glob returns matches in arbitrary order; sort so that the
                # order of extracted documents is reproducible.
                filenames = sorted(
                    glob.glob(
                        f'{self.data_directory}/**/*.csv', recursive=True
                    )
                )
            else:
                # An empty data directory would turn the pattern into
                # '/**/*.csv' and crawl the entire filesystem root.
                filenames = []

            return ((filename, {}) for filename in filenames)

    return NewReader()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
import os | ||
from addcorpus.models import Corpus | ||
from addcorpus.reader import make_reader | ||
|
||
HERE = os.path.abspath(os.path.dirname(__file__)) | ||
|
||
|
||
def test_make_reader_python(mock_corpus):
    '''
    The reader made for the `mock_corpus` fixture yields 7 documents,
    starting with Hamlet's first line.
    '''
    expected_first_document = {
        'character': 'HAMLET',
        'lines': ["Whither wilt thou lead me? Speak, I\'ll go no further."]
    }

    reader = make_reader(Corpus.objects.get(name=mock_corpus))
    documents = list(reader.documents())

    # The number of lines differs because of different corpus configuration
    assert len(documents) == 7
    assert documents[0] == expected_first_document
|
||
|
||
def test_make_reader_json(json_mock_corpus):
    '''
    After pointing the `json_mock_corpus` fixture at the `csv_example`
    directory next to this file, its reader yields 10 documents, starting
    with Hamlet's first line.
    '''
    expected_first_document = {
        'character': 'HAMLET',
        'line': "Whither wilt thou lead me? Speak, I\'ll go no further."
    }

    # Store the source data location before building the reader.
    configuration = json_mock_corpus.configuration
    configuration.data_directory = os.path.join(HERE, 'csv_example')
    configuration.save()

    documents = list(make_reader(json_mock_corpus).documents())

    # The number of lines differs because of different corpus configuration
    assert len(documents) == 10
    assert documents[0] == expected_first_document
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
Oops, something went wrong.