Skip to content

Commit

Permalink
Merge branch 'develop' into feature/settings-documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
lukavdplas authored Apr 18, 2024
2 parents 777b9ac + 191ec50 commit 839b5ad
Show file tree
Hide file tree
Showing 46 changed files with 1,022 additions and 100 deletions.
22 changes: 22 additions & 0 deletions .github/workflows/release.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
name: Release

on:
  workflow_dispatch:
  push:
    branches:
      - 'release/**'
      - 'hotfix/**'

jobs:
  citation-update:
    runs-on: ubuntu-latest
    steps:
      # `uses:` is only valid at the job level for reusable workflows;
      # checking out the repository belongs in a step.
      - uses: actions/checkout@v3
      - name: Update CITATION.cff
        # `name:` and `run:` must belong to the same step (one list item).
        run: |
          # GNU grep's BRE/ERE has no \d — use an explicit digit class.
          version=`grep -oE '[0-9]+\.[0-9]+\.[0-9]+' package.json | head -n 1`
          today=`date +"%Y-%m-%d"`
          sed -i "s/^version: [[:digit:]]\{1,\}\.[[:digit:]]\{1,\}\.[[:digit:]]\{1,\}/version: $version/" CITATION.cff
          sed -i "s/[[:digit:]]\{4\}-[[:digit:]]\{2\}-[[:digit:]]\{2\}/$today/" CITATION.cff
          # CI runners have no git identity configured; committing fails without one.
          git config user.name "github-actions"
          git config user.email "github-actions@github.com"
          git commit -a -m "update version and date in CITATION.cff"
          # Without a push the commit is discarded with the runner.
          git push
4 changes: 2 additions & 2 deletions CITATION.cff
Original file line number Diff line number Diff line change
Expand Up @@ -35,5 +35,5 @@ keywords:
- elasticsearch
- natural language processing
license: MIT
version: 5.5.1
date-released: '2024-03-21'
version: 5.6.0
date-released: '2024-04-15'
19 changes: 17 additions & 2 deletions backend/addcorpus/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def show_warning_message(request):


class CorpusAdmin(admin.ModelAdmin):
readonly_fields = ['name', 'configuration', 'has_python_definition', 'ready_to_index', 'ready_to_publish']
readonly_fields = ['configuration', 'ready_to_index', 'ready_to_publish']
fields = ['name', 'groups', 'configuration', 'has_python_definition', 'ready_to_index', 'ready_to_publish', 'active']
list_display = ['name', 'active']
list_filter = ['groups', 'active']
Expand Down Expand Up @@ -44,6 +44,13 @@ class CorpusConfigurationAdmin(admin.ModelAdmin):
'image',
]
}
), (
'Source data extraction',
{
'fields': [
'source_data_delimiter',
]
}
), (
'Content',
{
Expand Down Expand Up @@ -104,13 +111,21 @@ class FieldAdmin(admin.ModelAdmin):
]
}
),
(
'Source data extraction',
{
'fields': [
'extract_column',
'required',
]
}
),
(
'Indexing options',
{
'fields': [
'es_mapping',
'indexed',
'required',
]
}
), (
Expand Down
18 changes: 12 additions & 6 deletions backend/addcorpus/es_mappings.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
from addcorpus.es_settings import add_language_string
from addcorpus.es_settings import add_language_string, stopwords_available, stemming_available

def main_content_mapping(token_counts=True, stopword_analysis=False, stemming_analysis=False, language=None, updated_highlighting=True):
'''
Mapping for the main content field. Options:
- `token_counts`: enables aggregations for the total number of words. Used for relative term frequencies.
- `stopword_analysis`: enables analysis using stopword removal.
- `stemming_analysis`: enables analysis using stemming.
- `stopword_analysis`: enables analysis using stopword removal, if available for the language.
- `stemming_analysis`: enables analysis using stemming, if available for the language.
- `updated_highlighting`: enables the new highlighter, which only works for fields that are indexed with the term vector set to 'with_positions_offsets'.
'''

Expand All @@ -26,13 +26,13 @@ def main_content_mapping(token_counts=True, stopword_analysis=False, stemming_an
"type": "token_count",
"analyzer": "standard"
}
if stopword_analysis:
if stopword_analysis and stopwords_available(language):
multifields['clean'] = {
"type": "text",
"analyzer": add_language_string('clean', language),
"term_vector": "with_positions_offsets" # include character positions for highlighting
}
if stemming_analysis:
if stemming_analysis and stemming_available(language):
multifields['stemmed'] = {
"type": "text",
"analyzer": add_language_string('stemmed', language),
Expand Down Expand Up @@ -87,8 +87,14 @@ def int_mapping():
'type': 'integer'
}

def float_mapping():
    """Elasticsearch field mapping for a floating-point value."""
    return dict(type='float')


def bool_mapping():
    """Elasticsearch field mapping for a boolean value."""
    return dict(type='boolean')

def geo_mapping():
    """Elasticsearch field mapping for a geographical point."""
    return dict(type='geo_point')
62 changes: 43 additions & 19 deletions backend/addcorpus/es_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,16 @@
import warnings

from django.conf import settings
from langcodes import Language
from langcodes import Language, standardize_tag
import nltk

# available Elasticsearch stemmers [https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-stemmer-tokenfilter.html]
AVAILABLE_ES_STEMMERS = ['arabic', 'armenian', 'basque', 'bengali', 'brazilian',
'bulgarian', 'catalan', 'cjk', 'czech', 'danish', 'dutch',
AVAILABLE_ES_STEMMERS = ['arabic', 'armenian', 'basque', 'bengali', 'brazilian',
'bulgarian', 'catalan', 'cjk', 'czech', 'danish', 'dutch',
'english', 'estonian', 'finnish', 'french', 'galician',
'german', 'greek', 'hindi', 'hungarian', 'indonesian',
'irish', 'italian', 'latvian', 'lithuanian', 'norwegian',
'persian', 'portuguese', 'romanian', 'russian', 'sorani',
'german', 'greek', 'hindi', 'hungarian', 'indonesian',
'irish', 'italian', 'latvian', 'lithuanian', 'norwegian',
'persian', 'portuguese', 'romanian', 'russian', 'sorani',
'spanish', 'swedish', 'turkish', 'thai']

def get_language_key(language_code):
Expand All @@ -21,28 +21,52 @@ def get_language_key(language_code):
E.g. 'en' -> 'english'
'''

return Language.make(language_code).display_name().lower()
return Language.make(standardize_tag(language_code)).display_name().lower()


def get_nltk_stopwords(language_code):
def _stopwords_directory() -> str:
    """Return the NLTK stopwords corpus directory, downloading it if absent."""
    directory = os.path.join(settings.NLTK_DATA_PATH, 'corpora', 'stopwords')
    if not os.path.exists(directory):
        # First use on this machine: fetch the stopword lists into the
        # configured NLTK data path.
        nltk.download('stopwords', settings.NLTK_DATA_PATH)
    return directory

languages = os.listdir(stopwords_dir)
def _stopwords_path(language_code: str) -> str:
    """Return the expected path of the NLTK stopword file for *language_code*.

    The file is named after the language's lowercase display name
    (e.g. 'english'); the returned path may or may not exist.
    """
    # Renamed local from `dir`, which shadowed the builtin.
    stopwords_dir = _stopwords_directory()
    language = get_language_key(language_code)
    return os.path.join(stopwords_dir, language)

def stopwords_available(language_code: str) -> bool:
    """Whether an NLTK stopword list exists for this language.

    A falsy language code (empty string or None) counts as unavailable.
    """
    if not language_code:
        return False
    return os.path.exists(_stopwords_path(language_code))

if language in languages:
filepath = os.path.join(stopwords_dir, language)
with open(filepath) as infile:
def get_nltk_stopwords(language_code):
    """Read the NLTK stopword list for *language_code*.

    Raises:
        NotImplementedError: when no stopword list exists for the language.
    """
    path = _stopwords_path(language_code)
    if not os.path.exists(path):
        raise NotImplementedError('language {} has no nltk stopwords list'.format(language_code))
    with open(path) as stopword_file:
        return [line.strip() for line in stopword_file]

def add_language_string(name, language):
    """Suffix an analyzer name with a language code, e.g. 'clean' -> 'clean_en'.

    Returns *name* unchanged when *language* is falsy.
    """
    if language:
        return '{}_{}'.format(name, language)
    return name

def stemming_available(language_code: str) -> bool:
    '''
    Check whether stemming is supported for a language.

    Parameters:
        language_code: an ISO-639 language code

    Returns:
        whether elasticsearch supports stemming analysis in this language.
        A falsy language code counts as unsupported.
    '''
    return bool(language_code) and get_language_key(language_code) in AVAILABLE_ES_STEMMERS

def es_settings(languages=[], stopword_analysis=False, stemming_analysis=False):
'''
Make elasticsearch settings json for a corpus index. Options:
Expand All @@ -55,9 +79,9 @@ def es_settings(languages=[], stopword_analysis=False, stemming_analysis=False):
clean_analyzer_name = 'clean'
stemmer_filter_name = 'stemmer'
stemmed_analyzer_name = 'stemmed'

set_char_filter(settings)

for language in languages:
# do not attach language isocodes if there is just one language

Expand All @@ -72,7 +96,7 @@ def es_settings(languages=[], stopword_analysis=False, stemming_analysis=False):
add_language_string(clean_analyzer_name, language),
)
if stemming_analysis:
if not get_language_key(language) in AVAILABLE_ES_STEMMERS:
if not stemming_available(language):
warnings.warn('You specified `stemming_analysis=True`, but \
there is no stemmer available for this language')
continue
Expand Down Expand Up @@ -157,9 +181,9 @@ def set_stopword_filter(settings, stopword_filter_name, language):
})
settings['analysis']['filter'] = filters
return True

def set_clean_analyzer(settings, stopword_filter_name, clean_analyzer_name):
    """Register the 'clean' analyzer (built on the given stopword filter)
    under the settings' analysis.analyzer mapping."""
    analyzer_map = settings['analysis'].setdefault('analyzer', {})
    analyzer_map[clean_analyzer_name] = make_clean_analyzer(stopword_filter_name)
Empty file.
11 changes: 11 additions & 0 deletions backend/addcorpus/json_corpora/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import os
import json

import pytest

# Directory containing this conftest; test fixtures live alongside it.
_here = os.path.abspath(os.path.dirname(__file__))


@pytest.fixture()
def json_corpus_data():
    """Parsed contents of the mock corpus JSON definition used in these tests."""
    mock_corpus_path = os.path.join(_here, 'tests', 'mock_corpus.json')
    with open(mock_corpus_path) as corpus_file:
        return json.load(corpus_file)
Loading

0 comments on commit 839b5ad

Please sign in to comment.