Skip to content

Commit

Permalink
Merge pull request #1720 from CentreForDigitalHumanities/feature/peac…
Browse files Browse the repository at this point in the history
…eportal-sourcedatabase

Remove source database filter limit
  • Loading branch information
JeltevanBoheemen authored Dec 6, 2024
2 parents 2c8093b + d9f8863 commit 9a3810b
Show file tree
Hide file tree
Showing 3 changed files with 184 additions and 7 deletions.
170 changes: 170 additions & 0 deletions backend/corpora/peaceportal/northafrica.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@

import datetime
import os
import re
from typing import List, Optional, Tuple, Union

from addcorpus.es_mappings import main_content_mapping, text_mapping
from addcorpus.python_corpora.corpus import (FieldDefinition,
XLSXCorpusDefinition)
from addcorpus.python_corpora.extract import CSV, Constant
from corpora.peaceportal.peaceportal import PeacePortal, transform_to_date_range
from corpora.utils.exclude_fields import exclude_fields_without_extractor
from django.conf import settings


def convert_sex(value: str) -> List[str]:
    '''Translate a ';'-separated list of sex codes into display values.

    Recognised codes are 'M' and 'F'; '?', 'X' and 'M?' are mapped to
    'Unknown'. Any other code is also reported as 'Unknown' rather than
    raising a KeyError, so a single unexpected value cannot abort the
    extraction of a whole document.

    Parameters:
        value: raw cell content, e.g. 'M; F'. May be empty or None.

    Returns:
        One entry per ';'-separated code, in the original order. An empty
        or missing cell yields an empty list.
    '''
    if not value or not value.strip():
        return []
    convert_table = {
        '?': 'Unknown',
        'X': 'Unknown',
        'M': 'M',
        'F': 'F',
        'M?': 'Unknown'
    }
    # Keep one output entry per input entry (even for blank or unrecognised
    # ones) so positions stay aligned with the source cell.
    return [convert_table.get(code.strip(), 'Unknown')
            for code in value.split(';')]


def convert_names(value) -> List[str]:
    '''Split a ';'-separated cell into a list of whitespace-trimmed names.

    A falsy value (None or an empty string) yields an empty list.
    '''
    if value:
        return [part.strip() for part in value.split(';')]
    return []


def convert_ages(value: Union[str, int]) -> List[int]:
    '''Convert an age cell into a list of integer ages.

    Parameters:
        value: either a single integer, or a string holding one or more
            ';'-separated ages. May be None for an empty cell.

    Returns:
        A list of ages. A single integer is wrapped in a one-element list
        so the return type is always a list. Entries that cannot be parsed
        as an integer are skipped. None yields an empty list.
    '''
    if value is None:
        return []
    if isinstance(value, int):
        return [value]
    ages = []
    for age_string in value.strip().split(';'):
        try:
            ages.append(int(age_string.strip()))
        except ValueError:
            # Non-numeric entries (e.g. '?' or blanks) are deliberately dropped
            continue
    return ages


def convert_languages(value: str) -> List[str]:
    '''Split a ';'-separated language cell and normalise each entry.

    Each entry is trimmed and stripped of parentheses. Entries whose
    lower-cased form is one of the expected languages are returned with
    their original casing; every other entry becomes 'Unknown'.
    '''
    known = ('greek', 'latin', 'hebrew', 'semitic')
    result = []
    for raw in value.strip().split(';'):
        cleaned = raw.strip().replace('(', '').replace(')', '')
        if cleaned.lower() in known:
            result.append(cleaned)
        else:
            result.append('Unknown')
    return result


def convert_language_codes(value: str) -> List[str]:
    '''Map the languages in a ';'-separated cell to language codes.

    The cell is first normalised with convert_languages. Because that
    function accepts language names case-insensitively and returns them
    with their original casing, the code lookup here is case-insensitive
    too; otherwise an entry such as 'greek' would be recognised as a
    language but still receive the code 'Unknown'.

    Returns:
        One code per entry ('el', 'la', 'he' or 'sem'), or 'Unknown' for
        entries that are not an expected language.
    '''
    codes = {
        'greek': 'el',
        'latin': 'la',
        'hebrew': 'he',
        'semitic': 'sem'
    }
    return [codes.get(lang.lower(), 'Unknown')
            for lang in convert_languages(value)]


def convert_dating(value: str) -> dict:
    '''Extract a date range if possible, otherwise use global minimum and maximum.

    Parameters:
        value: raw dating cell. A string starting with '<year>-<year>'
            is interpreted as a range of years.

    Returns:
        The result of transform_to_date_range: for a recognised range,
        called with 1 January of the first year and 31 December of the
        second year; in every other case called with (None, None) so that
        the defaults are used.
    '''
    dating_pattern = r'(\d+)\-(\d+)'
    # Cells may hold something other than text (e.g. be empty); only strings
    # can carry a 'from-to' range.
    match = re.match(dating_pattern, value) if isinstance(value, str) else None
    if match:
        try:
            earliest = datetime.date(
                year=int(match.group(1)), month=1, day=1)
            latest = datetime.date(
                year=int(match.group(2)), month=12, day=31)
        except ValueError:
            # Year outside the range datetime.date supports (1..9999):
            # no usable range could be extracted, fall through to defaults.
            pass
        else:
            return transform_to_date_range(earliest, latest)
    return transform_to_date_range(None, None)  # Use defaults


def format_comments(values: Tuple):
    '''Build the text of the comment field.

    Currently the text consists of a single line:
    - Remarks about age (taken from the first element of `values`)
    '''
    age_remarks = values[0]
    return f'Remarks about age:\t{age_remarks}\n'


def convert_none(value: str) -> str:
    '''Turn the literal string 'None' into an actual None.

    Any other value is returned unchanged.
    '''
    return None if value == 'None' else value


class PeaceportalNorthAfrica(PeacePortal, XLSXCorpusDefinition):
    '''Corpus definition for 'Jewish Epitaphs from North Africa-Carthage'.

    Extends the shared PeacePortal definition: extractors are attached to
    the inherited fields, reading columns of a single XLSX file, and a few
    corpus-specific fields are added.
    '''
    data_directory = settings.PEACEPORTAL_NORTHAFRICA_DATA
    es_index = getattr(
        settings, 'PEACEPORTAL_NORTHAFRICA_ES_INDEX', 'peaceportal-northafrica')
    title = 'Jewish Epitaphs from North Africa-Carthage'

    def __init__(self):
        super().__init__()
        self.source_database.extractor = Constant(
            value='Jewish Epitaphs from North Africa-Carthage'
        )

        # Prefix the spreadsheet ID with the corpus name
        self._id.extractor = CSV(
            'ID', transform=lambda identifier: f'NorthAfrica_{identifier}')
        self.transcription.extractor = CSV('Inscription')

        # Parse the 'Dating' column into a date range
        self.date.extractor = CSV('Dating', transform=convert_dating)

        # NOTE(review): the 'Names ' header has a trailing space; presumably
        # this matches the spreadsheet column - confirm against the data.
        self.names.extractor = CSV('Names ', transform=convert_names)
        self.sex.extractor = CSV('Sex', transform=convert_sex)
        self.age.extractor = CSV('Age', transform=convert_ages)

        # Corpus-specific field, added to the inherited field list
        self.fields.append(FieldDefinition(
            name='age_remarks',
            display_name='Remarks about age',
            description='Remarks about the age of the buried person(s)',
            es_mapping=text_mapping(),
            extractor=CSV('Remarks about Age', transform=convert_none),
        ))
        self.country.extractor = CSV('Country')
        self.settlement.extractor = CSV('Settlement')
        self.location_details.extractor = CSV('Location Details')
        # Language names and language codes are both derived from the
        # 'Language' column; the code field is hidden.
        self.language.extractor = CSV(
            'Language', transform=convert_languages)
        self.language_code.extractor = CSV(
            'Language', transform=convert_language_codes)
        self.language_code.hidden = True
        self.iconography.extractor = CSV('Iconography', transform=convert_none)
        self.material.extractor = CSV('Material')
        self.bibliography.extractor = CSV('Bibliography')

        # The translation is the secondary content field
        self.transcription_english = FieldDefinition(
            name='transcription_en',
            display_name='Translation',
            description='English translation of this inscription.',
            es_mapping=main_content_mapping(
                stopword_analysis=True, stemming_analysis=True, language='en'),
            language='en',
            results_overview=True,
            search_field_core=True,
            display_type='text_content',
            extractor=CSV('Translation'),
        )

        # The commentary is the tertiary content field
        self.comments = FieldDefinition(
            name='comments',
            display_name='Commentary',
            description='Extra comments, questions or remarks on this inscription.',
            es_mapping=main_content_mapping(
                stopword_analysis=True, stemming_analysis=True, language='en'),
            language='en',
            search_field_core=True,
            display_type='text_content',
            extractor=CSV('Commentary', transform=convert_none)
        )

        # Drop fields of the parent corpus that did not get an extractor here
        self.fields = exclude_fields_without_extractor(self.fields)

        # Ensure the correct order of content fields: remove any existing
        # translation/commentary entries, then put the ones defined above
        # at the end, translation first.
        trailing_names = ('transcription_en', 'comments')
        self.fields = [
            field for field in self.fields
            if field.name not in trailing_names
        ] + [self.transcription_english, self.comments]

    def sources(self, *args, **kwargs):
        '''Yield the path of the single XLSX file inside the data directory.'''
        filename = 'Jewish Epitaphs from North Africa-Carthage.xlsx'
        yield os.path.join(self.data_directory, filename)
10 changes: 9 additions & 1 deletion backend/corpora/peaceportal/peaceportal.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from addcorpus.es_settings import es_settings
from corpora.peaceportal.utils import field_defaults


class PeacePortal(ParentCorpusDefinition):
'''
Base class for corpora in the PEACE portal.
Expand All @@ -32,7 +33,7 @@ class PeacePortal(ParentCorpusDefinition):
es_alias = getattr(settings, 'PEACEPORTAL_ALIAS', 'peaceportal')
scan_image_type = 'image/png'
# fields below are required by code but not actually used
min_date = datetime.datetime(year=746, month=1, day=1)
min_date = datetime.datetime(year=1, month=1, day=1)
category = 'inscription'

tag_entry = Tag('TEI')
Expand All @@ -42,6 +43,9 @@ class PeacePortal(ParentCorpusDefinition):
# el stands for modern Greek (1500-)
languages = ['en', 'de', 'nl', 'he', 'la', 'el']

# placeholder data directory
data_directory = ''

@property
def es_settings(self):
return es_settings(self.languages, stopword_analysis=True,
Expand Down Expand Up @@ -286,8 +290,12 @@ def zero_pad_year(input):
def transform_to_date_range(earliest, latest):
if not earliest:
earliest = PeacePortal.min_date
if isinstance(earliest, datetime.datetime):
earliest = earliest.date()
if not latest:
latest = PeacePortal.max_date
if isinstance(latest, datetime.datetime):
latest = latest.date()
return {
'gte': earliest,
'lte': latest
Expand Down
11 changes: 5 additions & 6 deletions backend/corpora/peaceportal/utils/field_defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ def source_database():
es_mapping=keyword_mapping(),
search_filter=MultipleChoiceFilter(
description='Search only within these databases.',
option_count=4,
),
csv_core=True
)
Expand Down Expand Up @@ -208,7 +207,7 @@ def country():
description="Country where the inscription was found.",
es_mapping=keyword_mapping(True),
search_filter=MultipleChoiceFilter(
description="Search only within these countries.", option_count=5
description="Search only within these countries.", option_count=50
),
visualizations=["resultscount"],
results_overview=True,
Expand All @@ -222,7 +221,7 @@ def settlement():
description="The settlement where the inscription was found.",
es_mapping=keyword_mapping(True),
search_filter=MultipleChoiceFilter(
description="Search only within these settlements.", option_count=29
description="Search only within these settlements.", option_count=50
),
visualizations=["resultscount"],
)
Expand All @@ -235,7 +234,7 @@ def region():
description="The region where the inscription was found.",
es_mapping=keyword_mapping(True),
search_filter=MultipleChoiceFilter(
description="Search only within these regions.", option_count=29
description="Search only within these regions.", option_count=50
),
visualizations=["resultscount"],
)
Expand All @@ -257,7 +256,7 @@ def material():
description="Type of material the inscription is written on.",
es_mapping=keyword_mapping(),
search_filter=MultipleChoiceFilter(
description="Search only within these material types.", option_count=39
description="Search only within these material types.", option_count=50
),
visualization_type="resultscount",
)
Expand All @@ -280,7 +279,7 @@ def language():
description="Language of the inscription.",
es_mapping=keyword_mapping(),
search_filter=MultipleChoiceFilter(
description="Search only within these languages.", option_count=10
description="Search only within these languages.", option_count=50
),
csv_core=True,
visualizations=["resultscount"],
Expand Down

0 comments on commit 9a3810b

Please sign in to comment.