Skip to content

Commit

Permalink
Merge pull request #1541 from UUDigitalHumanitieslab/feature/corpus-r…
Browse files Browse the repository at this point in the history
…eaders

Feature/corpus readers
  • Loading branch information
lukavdplas authored Apr 23, 2024
2 parents c234b81 + 30b86a7 commit e01f94b
Show file tree
Hide file tree
Showing 22 changed files with 188 additions and 71 deletions.
1 change: 1 addition & 0 deletions backend/addcorpus/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ class CorpusConfigurationAdmin(admin.ModelAdmin):
'Source data extraction',
{
'fields': [
'data_directory',
'source_data_delimiter',
]
}
Expand Down
11 changes: 0 additions & 11 deletions backend/addcorpus/json_corpora/conftest.py

This file was deleted.

74 changes: 41 additions & 33 deletions backend/addcorpus/json_corpora/import_json.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Dict, Iterable
from typing import Dict, Iterable, Optional
from datetime import datetime


Expand All @@ -15,10 +15,12 @@ def import_json_corpus(data: Dict) -> Corpus:

corpus, _created = Corpus.objects.get_or_create(name=name)

configuration = _parse_configuration(data)
configuration.corpus = corpus
configuration.full_clean()
# create a clean CorpusConfiguration object, but use the existing PK if possible
pk = corpus.configuration_obj.pk if corpus.configuration_obj else None
configuration = CorpusConfiguration(pk=pk, corpus=corpus)
configuration = _parse_configuration(data, configuration)
configuration.save()
configuration.full_clean()

_import_fields(data, configuration)

Expand All @@ -32,31 +34,25 @@ def create_index_name(corpus_name: str) -> str:
return corpus_name


def _parse_configuration(data: Dict) -> CorpusConfiguration:
title = get_path(data, 'meta', 'title')
description = get_path(data, 'meta', 'description')
category = get_path(data, 'meta', 'category')
es_index = create_index_name(get_path(data, 'name'))
languages = get_path(data, 'meta', 'languages')
min_date = _parse_date(get_path(data, 'meta', 'date_range', 'min'))
max_date = _parse_date(get_path(data, 'meta', 'date_range', 'max'))
default_sort = get_path(data, 'options', 'default_sort') or {}
language_field = get_path(data, 'options', 'language_field') or ''
document_context = get_path(data, 'options', 'document_context') or {}
delimiter = get_path(data, 'source_data', 'options', 'delimiter') or ','
return CorpusConfiguration(
title=title,
description=description,
category=category,
es_index=es_index,
languages=languages,
min_date=min_date,
max_date=max_date,
default_sort=default_sort,
language_field=language_field,
document_context=document_context,
source_data_delimiter=delimiter,
)
def _parse_configuration(data: Dict, configuration: CorpusConfiguration) -> CorpusConfiguration:
    '''
    Copy the settings of a JSON corpus definition onto a configuration object.

    Parameters:
        data: the JSON corpus definition.
        configuration: the object to populate; it is modified in place.

    Returns:
        The `configuration` object that was passed in.
    '''

    def meta(*keys):
        # value stored under the 'meta' section of the definition
        return get_path(data, 'meta', *keys)

    def option(key):
        # value stored under the 'options' section of the definition
        return get_path(data, 'options', key)

    configuration.title = meta('title')
    configuration.description = meta('description')
    configuration.category = meta('category')
    configuration.es_index = create_index_name(get_path(data, 'name'))
    configuration.languages = meta('languages')
    configuration.min_date = _parse_date(meta('date_range', 'min'))
    configuration.max_date = _parse_date(meta('date_range', 'max'))

    # options that are missing or falsy fall back to an empty default
    configuration.default_sort = option('default_sort') or {}
    configuration.language_field = option('language_field') or ''
    configuration.document_context = option('document_context') or {}

    # the delimiter falls back to a comma
    delimiter = get_path(data, 'source_data', 'options', 'delimiter')
    configuration.source_data_delimiter = delimiter or ','

    return configuration


def _parse_date(date: str):
Expand All @@ -67,15 +63,24 @@ def _import_fields(data: Dict, configuration: CorpusConfiguration) -> None:
fields_data = get_path(data, 'fields')

for field_data in fields_data:
field = _parse_field(field_data)
field.corpus_configuration = configuration
field.full_clean()
field = _parse_field(field_data, configuration)
field.save()
field.full_clean()

for field in configuration.fields.exclude(name__in=(f['name'] for f in fields_data)):
field.delete()

_include_ngram_visualisation(configuration.fields.all())


def _parse_field(field_data: Dict) -> Field:
def _field_pk(name: str, configuration: CorpusConfiguration):
    '''
    Look up the primary key of the field called `name` in `configuration`.

    Returns the primary key of the matching Field, or None when the
    configuration has no field with that name.
    '''
    try:
        existing = Field.objects.get(name=name, corpus_configuration=configuration)
    except Field.DoesNotExist:
        return None
    return existing.pk


def _parse_field(field_data: Dict, configuration: Optional[CorpusConfiguration] = None) -> Field:
name = get_path(field_data, 'name')
display_name = get_path(field_data, 'display_name')
description = get_path(field_data, 'description')
Expand All @@ -84,6 +89,8 @@ def _parse_field(field_data: Dict) -> Field:
extract_column = get_path(field_data, 'extract', 'column')

field = Field(
pk=_field_pk(name, configuration) if configuration else None,
corpus_configuration=configuration,
name=name,
display_name=display_name,
description=description,
Expand All @@ -109,6 +116,7 @@ def _parse_field(field_data: Dict) -> Field:
return field



def _parse_text_content_field(field: Field, field_data: Dict) -> Field:
language = _parse_language(field_data)
has_single_language = language and language != 'dynamic'
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Generated by Django 4.2.10 on 2024-04-16 12:11

import addcorpus.validation.creation
from django.db import migrations, models


class Migration(migrations.Migration):
    '''Add the `data_directory` field to the `corpusconfiguration` model.'''

    dependencies = [
        ('addcorpus', '0020_simplify_extraction_data'),
    ]

    operations = [
        migrations.AddField(
            model_name='corpusconfiguration',
            name='data_directory',
            field=models.CharField(
                blank=True,
                help_text='path to directory containing source data files',
                max_length=200,
                validators=[
                    addcorpus.validation.creation.validate_source_data_directory,
                ],
            ),
        ),
    ]
10 changes: 9 additions & 1 deletion backend/addcorpus/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@
validate_name_is_not_a_route_parameter, validate_search_filter,
validate_search_filter_with_mapping,
validate_searchable_field_has_full_text_search,
validate_sort_configuration, validate_visualizations_with_mapping)
validate_sort_configuration, validate_visualizations_with_mapping,
validate_source_data_directory,
)
from addcorpus.validation.indexing import (validate_essential_fields,
validate_has_configuration,
validate_language_field)
Expand Down Expand Up @@ -219,6 +221,12 @@ class CorpusConfiguration(models.Model):
help_text='name of the field that specifies the language of documents (if any);'
'required to use "dynamic" language on fields',
)
data_directory = models.CharField(
max_length=200,
validators=[validate_source_data_directory],
blank=True,
help_text='path to directory containing source data files',
)
source_data_delimiter = models.CharField(
max_length=1,
choices=[
Expand Down
11 changes: 7 additions & 4 deletions backend/addcorpus/python_corpora/load_corpus.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
from django.conf import settings
import re
from os.path import abspath, dirname
from importlib import util
import logging
import re
import sys
from importlib import util
from os.path import abspath, dirname

from addcorpus.python_corpora.corpus import CorpusDefinition
from django.conf import settings

logger = logging.getLogger(__name__)
from addcorpus.python_corpora.corpus import CorpusDefinition

Expand Down
2 changes: 1 addition & 1 deletion backend/addcorpus/python_corpora/save_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ def _copy_corpus_attributes(corpus_definition: CorpusDefinition, configuration:
'word_models_present',
'default_sort',
'language_field',
'data_directory',
]

try:
Expand All @@ -82,7 +83,6 @@ def _field_pk(name: str, configuration: CorpusConfiguration):
return Field.objects.get(corpus_configuration=configuration, name=name).pk
except Field.DoesNotExist:
return None
return field.pk

def _save_field_in_database(field_definition: FieldDefinition, configuration: CorpusConfiguration):
attributes_to_copy = [
Expand Down
9 changes: 6 additions & 3 deletions backend/addcorpus/python_corpora/tests/test_save_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@
from django.conf import settings
from addcorpus.tests.mock_csv_corpus import MockCSVCorpus
from addcorpus.models import Corpus, CorpusConfiguration
from addcorpus.python_corpora.save_corpus import _save_field_in_database, \
from addcorpus.python_corpora.save_corpus import (_save_field_in_database,
load_and_save_all_corpora, _save_or_skip_corpus
)


def test_saved_corpora(db):
Expand All @@ -21,8 +22,10 @@ def test_saved_corpora(db):
assert corpus.configuration_obj
assert corpus.active

assert len(Corpus.objects.all()) == len(configured)
assert len(CorpusConfiguration.objects.all()) == len(configured)
assert Corpus.objects.filter(
has_python_definition=True).count() == len(configured)
assert CorpusConfiguration.objects.filter(
corpus__has_python_definition=True).count() == len(configured)

def test_no_errors_when_saving_corpora(db, capsys):
# try running the save function
Expand Down
39 changes: 39 additions & 0 deletions backend/addcorpus/reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import glob

from addcorpus.models import Corpus, Field
from addcorpus.python_corpora.load_corpus import load_corpus_definition
from ianalyzer_readers.extract import CSV
from ianalyzer_readers.readers.core import Field as ReaderField
from ianalyzer_readers.readers.core import Reader
from ianalyzer_readers.readers.csv import CSVReader


def make_reader_field(corpus_field: Field) -> ReaderField:
    '''
    Build the reader field that corresponds to a corpus field.

    The reader field takes its name from `corpus_field` and extracts its
    value with a CSV extractor on the field's `extract_column`.
    '''
    field_name = corpus_field.name
    column_extractor = CSV(corpus_field.extract_column)
    return ReaderField(name=field_name, extractor=column_extractor)


def make_reader(corpus: Corpus) -> Reader:
    '''
    From a corpus, return a Reader object that allows source extraction.

    For Python corpora, this simply loads the definition class.
    For JSON-based corpora, a CSV reader is constructed from the corpus
    configuration in the database.

    The constructed reader uses every `.csv` file found (recursively) in the
    configured data directory as a source, in sorted order, each with empty
    metadata. If no data directory is configured, it has no sources.
    '''
    if corpus.has_python_definition:
        return load_corpus_definition(corpus.name)

    configuration = corpus.configuration

    class NewReader(CSVReader):
        data_directory = configuration.data_directory
        delimiter = configuration.source_data_delimiter
        fields = [make_reader_field(f) for f in configuration.fields.all()]

        def sources(self, *args, **kwargs):
            if not self.data_directory:
                # The data directory may be left blank. Without this guard the
                # pattern would be '/**/*.csv', i.e. a recursive scan of the
                # filesystem root.
                filenames = []
            else:
                # Escape the directory so that glob metacharacters in the path
                # are taken literally; sort so that the order of sources does
                # not depend on the filesystem.
                pattern = f'{glob.escape(self.data_directory)}/**/*.csv'
                filenames = sorted(glob.glob(pattern, recursive=True))

            return ((fn, {}) for fn in filenames)

    return NewReader()
2 changes: 1 addition & 1 deletion backend/addcorpus/tests/mock_csv_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ class MockCSVCorpus(CSVCorpusDefinition):
languages = ['en']
category = 'book'

def sources(self, start, end):
def sources(self, **kwargs):
for filename in os.listdir(self.data_directory):
full_path = os.path.join(self.data_directory, filename)
yield full_path, {
Expand Down
2 changes: 1 addition & 1 deletion backend/addcorpus/tests/test_csvcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
def test_csv():
corpus = MockCSVCorpus()

sources = list(corpus.sources(corpus.min_date, corpus.max_date))
sources = list(corpus.sources(start=corpus.min_date, end=corpus.max_date))
assert len(sources) == 1 and sources[0][1] == {'filename': 'example.csv'}

docs = corpus.source2dicts(sources[0])
Expand Down
30 changes: 30 additions & 0 deletions backend/addcorpus/tests/test_reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import os
from addcorpus.models import Corpus
from addcorpus.reader import make_reader

HERE = os.path.abspath(os.path.dirname(__file__))


def test_make_reader_python(mock_corpus):
    '''Check the documents read for the corpus named by `mock_corpus`.'''
    reader = make_reader(Corpus.objects.get(name=mock_corpus))
    documents = list(reader.documents())

    expected_first = {
        'character': 'HAMLET',
        'lines': ["Whither wilt thou lead me? Speak, I\'ll go no further."]
    }

    # The number of lines differs because of different corpus configuration
    assert len(documents) == 7
    assert documents[0] == expected_first


def test_make_reader_json(json_mock_corpus):
    '''Check the documents read for the JSON mock corpus from the example CSV directory.'''
    # point the corpus at the example data next to this test module
    configuration = json_mock_corpus.configuration
    configuration.data_directory = os.path.join(HERE, 'csv_example')
    configuration.save()

    documents = list(make_reader(json_mock_corpus).documents())

    expected_first = {
        'character': 'HAMLET',
        'line': "Whither wilt thou lead me? Speak, I\'ll go no further."
    }

    # The number of lines differs because of different corpus configuration
    assert len(documents) == 10
    assert documents[0] == expected_first
4 changes: 4 additions & 0 deletions backend/addcorpus/validation/creation.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,3 +186,7 @@ def validate_sort_configuration(sort_config):

if type(ascending) is not bool:
raise ValidationError(f'Sort configuration has invalid "ascending" property: {ascending}')

def validate_source_data_directory(value):
    '''
    Validate the path to a source data directory.

    An empty value is accepted. Any other value must be the path of an
    existing directory; otherwise a ValidationError is raised.
    '''
    if not value:
        return
    if os.path.isdir(value):
        return
    raise ValidationError(f'{value} is not a directory')
21 changes: 18 additions & 3 deletions backend/conftest.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,18 @@
import json
from time import sleep
import shutil
import os
import pytest
import requests
from allauth.account.models import EmailAddress

from addcorpus.json_corpora.import_json import import_json_corpus
from ianalyzer.elasticsearch import elasticsearch
from ianalyzer.settings_test import MEDIA_ROOT
from addcorpus.python_corpora.load_corpus import load_corpus_definition
from addcorpus.python_corpora.save_corpus import load_and_save_all_corpora
from es import es_index as index
from django.conf import settings


@pytest.fixture(autouse=True)
Expand Down Expand Up @@ -99,12 +102,24 @@ def es_client():
return client

# mock corpora
@pytest.fixture(autouse=True)
def add_mock_python_corpora_to_db(db, media_dir):
    '''Add the Python mock corpora to the database at the start of each test.'''
    load_and_save_all_corpora()


@pytest.fixture()
def json_corpus_data():
    '''
    Parsed contents of the mock JSON corpus definition.

    Loads `corpora_test/mock_corpus.json` (relative to `settings.BASE_DIR`)
    and returns the decoded JSON data.
    '''
    path = os.path.join(settings.BASE_DIR, 'corpora_test', 'mock_corpus.json')
    # Read the file as UTF-8 explicitly rather than relying on the
    # platform's default text encoding.
    with open(path, encoding='utf-8') as f:
        return json.load(f)


@pytest.fixture(autouse=True)
def add_mock_corpora_to_db(db, media_dir):
#add mock corpora to the database at the start of each test
load_and_save_all_corpora()
def json_mock_corpus(db, json_corpus_data):
# add json mock corpora to the database at the start of each test
return import_json_corpus(json_corpus_data)


def index_test_corpus(es_client, corpus_name):
corpus = load_corpus_definition(corpus_name)
Expand Down
1 change: 0 additions & 1 deletion backend/corpora/parliament/parliament.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ class Parliament(CorpusDefinition):
min_date = field_defaults.MIN_DATE
max_date = field_defaults.MAX_DATE
image = 'parliament.jpeg'
data_directory = 'bogus'
wordmodels_page = 'documentation.md'

category = 'parliament'
Expand Down
1 change: 0 additions & 1 deletion backend/corpora/peaceportal/peaceportal.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ class PeacePortal(ParentCorpusDefinition):
# fields below are required by code but not actually used
min_date = datetime.datetime(year=746, month=1, day=1)
category = 'inscription'
data_directory = 'bogus'

# Data overrides from .common.XMLCorpus
tag_entry = 'TEI'
Expand Down
File renamed without changes.
Loading

0 comments on commit e01f94b

Please sign in to comment.