Skip to content

Commit

Permalink
Merge branch 'develop' into feature/settings-documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
lukavdplas authored Apr 18, 2024
2 parents 777b9ac + 191ec50 commit 839b5ad
Show file tree
Hide file tree
Showing 46 changed files with 1,022 additions and 100 deletions.
22 changes: 22 additions & 0 deletions .github/workflows/release.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
name: Release

on:
  workflow_dispatch:
  push:
    branches:
      - 'release/**'
      - 'hotfix/**'

jobs:
  citation-update:
    runs-on: ubuntu-latest
    steps:
      # `uses:` is only valid at the job level for reusable workflows;
      # checking out the repository belongs in a step.
      - uses: actions/checkout@v3
      - name: Update CITATION.cff
        # `name:` and `run:` must belong to the same step (one list item).
        run: |
          # GNU grep's BRE/ERE has no \d — use an explicit digit class.
          version=`grep -oE '[0-9]+\.[0-9]+\.[0-9]+' package.json | head -n 1`
          today=`date +"%Y-%m-%d"`
          sed -i "s/^version: [[:digit:]]\{1,\}\.[[:digit:]]\{1,\}\.[[:digit:]]\{1,\}/version: $version/" CITATION.cff
          sed -i "s/[[:digit:]]\{4\}-[[:digit:]]\{2\}-[[:digit:]]\{2\}/$today/" CITATION.cff
          # CI runners have no git identity configured; committing fails without one.
          git config user.name "github-actions"
          git config user.email "github-actions@github.com"
          git commit -a -m "update version and date in CITATION.cff"
          # Without a push the commit is discarded with the runner.
          git push
4 changes: 2 additions & 2 deletions CITATION.cff
Original file line number Diff line number Diff line change
Expand Up @@ -35,5 +35,5 @@ keywords:
- elasticsearch
- natural language processing
license: MIT
version: 5.5.1
date-released: '2024-03-21'
version: 5.6.0
date-released: '2024-04-15'
19 changes: 17 additions & 2 deletions backend/addcorpus/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def show_warning_message(request):


class CorpusAdmin(admin.ModelAdmin):
readonly_fields = ['name', 'configuration', 'has_python_definition', 'ready_to_index', 'ready_to_publish']
readonly_fields = ['configuration', 'ready_to_index', 'ready_to_publish']
fields = ['name', 'groups', 'configuration', 'has_python_definition', 'ready_to_index', 'ready_to_publish', 'active']
list_display = ['name', 'active']
list_filter = ['groups', 'active']
Expand Down Expand Up @@ -44,6 +44,13 @@ class CorpusConfigurationAdmin(admin.ModelAdmin):
'image',
]
}
), (
'Source data extraction',
{
'fields': [
'source_data_delimiter',
]
}
), (
'Content',
{
Expand Down Expand Up @@ -104,13 +111,21 @@ class FieldAdmin(admin.ModelAdmin):
]
}
),
(
'Source data extraction',
{
'fields': [
'extract_column',
'required',
]
}
),
(
'Indexing options',
{
'fields': [
'es_mapping',
'indexed',
'required',
]
}
), (
Expand Down
18 changes: 12 additions & 6 deletions backend/addcorpus/es_mappings.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
from addcorpus.es_settings import add_language_string
from addcorpus.es_settings import add_language_string, stopwords_available, stemming_available

def main_content_mapping(token_counts=True, stopword_analysis=False, stemming_analysis=False, language=None, updated_highlighting=True):
'''
Mapping for the main content field. Options:
- `token_counts`: enables aggregations for the total number of words. Used for relative term frequencies.
- `stopword_analysis`: enables analysis using stopword removal.
- `stemming_analysis`: enables analysis using stemming.
- `stopword_analysis`: enables analysis using stopword removal, if available for the language.
- `stemming_analysis`: enables analysis using stemming, if available for the language.
- `updated_highlighting`: enables the new highlighter, which only works for fields that are indexed with the term vector set to 'with_positions_offsets'.
'''

Expand All @@ -26,13 +26,13 @@ def main_content_mapping(token_counts=True, stopword_analysis=False, stemming_an
"type": "token_count",
"analyzer": "standard"
}
if stopword_analysis:
if stopword_analysis and stopwords_available(language):
multifields['clean'] = {
"type": "text",
"analyzer": add_language_string('clean', language),
"term_vector": "with_positions_offsets" # include character positions for highlighting
}
if stemming_analysis:
if stemming_analysis and stemming_available(language):
multifields['stemmed'] = {
"type": "text",
"analyzer": add_language_string('stemmed', language),
Expand Down Expand Up @@ -87,8 +87,14 @@ def int_mapping():
'type': 'integer'
}

def float_mapping():
    """Elasticsearch field mapping for a floating-point value."""
    return dict(type='float')


def bool_mapping():
    """Elasticsearch field mapping for a boolean value."""
    return dict(type='boolean')

def geo_mapping():
    """Elasticsearch field mapping for a geographical point."""
    return dict(type='geo_point')
62 changes: 43 additions & 19 deletions backend/addcorpus/es_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,16 @@
import warnings

from django.conf import settings
from langcodes import Language
from langcodes import Language, standardize_tag
import nltk

# available Elasticsearch stemmers [https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-stemmer-tokenfilter.html]
AVAILABLE_ES_STEMMERS = ['arabic', 'armenian', 'basque', 'bengali', 'brazilian',
'bulgarian', 'catalan', 'cjk', 'czech', 'danish', 'dutch',
AVAILABLE_ES_STEMMERS = ['arabic', 'armenian', 'basque', 'bengali', 'brazilian',
'bulgarian', 'catalan', 'cjk', 'czech', 'danish', 'dutch',
'english', 'estonian', 'finnish', 'french', 'galician',
'german', 'greek', 'hindi', 'hungarian', 'indonesian',
'irish', 'italian', 'latvian', 'lithuanian', 'norwegian',
'persian', 'portuguese', 'romanian', 'russian', 'sorani',
'german', 'greek', 'hindi', 'hungarian', 'indonesian',
'irish', 'italian', 'latvian', 'lithuanian', 'norwegian',
'persian', 'portuguese', 'romanian', 'russian', 'sorani',
'spanish', 'swedish', 'turkish', 'thai']

def get_language_key(language_code):
Expand All @@ -21,28 +21,52 @@ def get_language_key(language_code):
E.g. 'en' -> 'english'
'''

return Language.make(language_code).display_name().lower()
return Language.make(standardize_tag(language_code)).display_name().lower()


def get_nltk_stopwords(language_code):
def _stopwords_directory() -> str:
    """Return the NLTK stopwords corpus directory, downloading it if absent."""
    directory = os.path.join(settings.NLTK_DATA_PATH, 'corpora', 'stopwords')
    if not os.path.exists(directory):
        # First use on this machine: fetch the stopword lists into the
        # configured NLTK data path.
        nltk.download('stopwords', settings.NLTK_DATA_PATH)
    return directory

languages = os.listdir(stopwords_dir)
def _stopwords_path(language_code: str) -> str:
    """Return the expected path of the NLTK stopword file for *language_code*.

    The file is named after the language's lowercase display name
    (e.g. 'english'); the returned path may or may not exist.
    """
    # Renamed local from `dir`, which shadowed the builtin.
    stopwords_dir = _stopwords_directory()
    language = get_language_key(language_code)
    return os.path.join(stopwords_dir, language)

def stopwords_available(language_code: str) -> bool:
    """Whether an NLTK stopword list exists for this language.

    A falsy language code (empty string or None) counts as unavailable.
    """
    if not language_code:
        return False
    return os.path.exists(_stopwords_path(language_code))

if language in languages:
filepath = os.path.join(stopwords_dir, language)
with open(filepath) as infile:
def get_nltk_stopwords(language_code):
    """Read the NLTK stopword list for *language_code*.

    Raises:
        NotImplementedError: when no stopword list exists for the language.
    """
    path = _stopwords_path(language_code)
    if not os.path.exists(path):
        raise NotImplementedError('language {} has no nltk stopwords list'.format(language_code))
    with open(path) as stopword_file:
        return [line.strip() for line in stopword_file]

def add_language_string(name, language):
    """Suffix an analyzer name with a language code, e.g. 'clean' -> 'clean_en'.

    Returns *name* unchanged when *language* is falsy.
    """
    if language:
        return '{}_{}'.format(name, language)
    return name

def stemming_available(language_code: str) -> bool:
    '''
    Check whether stemming is supported for a language.

    Parameters:
        language_code: an ISO-639 language code

    Returns:
        whether elasticsearch supports stemming analysis in this language.
        A falsy language code counts as unsupported.
    '''
    return bool(language_code) and get_language_key(language_code) in AVAILABLE_ES_STEMMERS

def es_settings(languages=[], stopword_analysis=False, stemming_analysis=False):
'''
Make elasticsearch settings json for a corpus index. Options:
Expand All @@ -55,9 +79,9 @@ def es_settings(languages=[], stopword_analysis=False, stemming_analysis=False):
clean_analyzer_name = 'clean'
stemmer_filter_name = 'stemmer'
stemmed_analyzer_name = 'stemmed'

set_char_filter(settings)

for language in languages:
# do not attach language isocodes if there is just one language

Expand All @@ -72,7 +96,7 @@ def es_settings(languages=[], stopword_analysis=False, stemming_analysis=False):
add_language_string(clean_analyzer_name, language),
)
if stemming_analysis:
if not get_language_key(language) in AVAILABLE_ES_STEMMERS:
if not stemming_available(language):
warnings.warn('You specified `stemming_analysis=True`, but \
there is no stemmer available for this language')
continue
Expand Down Expand Up @@ -157,9 +181,9 @@ def set_stopword_filter(settings, stopword_filter_name, language):
})
settings['analysis']['filter'] = filters
return True

def set_clean_analyzer(settings, stopword_filter_name, clean_analyzer_name):
    """Register the 'clean' analyzer (built on the given stopword filter)
    under the settings' analysis.analyzer mapping."""
    analyzer_map = settings['analysis'].setdefault('analyzer', {})
    analyzer_map[clean_analyzer_name] = make_clean_analyzer(stopword_filter_name)
Empty file.
11 changes: 11 additions & 0 deletions backend/addcorpus/json_corpora/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import os
import json

import pytest

# Directory containing this conftest; test fixtures live alongside it.
_here = os.path.abspath(os.path.dirname(__file__))


@pytest.fixture()
def json_corpus_data():
    """Parsed contents of the mock corpus JSON definition used in these tests."""
    mock_corpus_path = os.path.join(_here, 'tests', 'mock_corpus.json')
    with open(mock_corpus_path) as corpus_file:
        return json.load(corpus_file)
Loading

0 comments on commit 839b5ad

Please sign in to comment.