Skip to content

Commit

Permalink
Merge branch 'develop' into feature/full-ngram
Browse files Browse the repository at this point in the history
  • Loading branch information
BeritJanssen committed Oct 26, 2023
2 parents ed49010 + c1b99ae commit 1369c43
Show file tree
Hide file tree
Showing 68 changed files with 870 additions and 146 deletions.
36 changes: 36 additions & 0 deletions .github/ISSUE_TEMPLATE/bug_report.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
---
name: Bug report
about: Let us know about something that isn't working right
title: ''
labels: bug
assignees: ''

---

### What went wrong?

Describe what happened.

### Expected behavior

What did you expect to happen?

### Screenshots

If applicable, please add a screenshot of the problem!

### Which version?

Please specify where you encountered the issue:

- [ ] https://ianalyzer.hum.uu.nl
- [ ] https://peopleandparliament.hum.uu.nl
- [ ] https://peace.sites.uu.nl/
- [ ] a server hosted elsewhere (i.e. not by the research software lab)
- [ ] a local server

If this happened on a local or third-party server, it helps if you can be more specific about the version. Please include the version number (e.g. "3.2.4") or a commit hash if you know it!

### To reproduce

How can a developer replicate the issue? Please provide any information you can. For example: "I went to https://ianalyzer.hum.uu.nl/search/troonredes?date=1814-01-01:1972-01-01 and then clicked on *Download CSV*. I pressed *cancel* and then I clicked *Download CSV* again."
20 changes: 20 additions & 0 deletions .github/ISSUE_TEMPLATE/feature_request.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
---
name: Feature request
about: Suggest an idea for something new
title: ''
labels: enhancement
assignees: ''

---

**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]

**Describe the solution you'd like**
A clear and concise description of what you want to happen.

**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.

**Additional context**
Add any other context or screenshots about the feature request here.
2 changes: 1 addition & 1 deletion backend/addcorpus/es_mappings.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
def main_content_mapping(token_counts=True, stopword_analysis=False, stemming_analysis=False, updated_highlighting=False):
def main_content_mapping(token_counts=True, stopword_analysis=False, stemming_analysis=False, updated_highlighting=True):
'''
Mapping for the main content field. Options:
Expand Down
8 changes: 7 additions & 1 deletion backend/addcorpus/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from addcorpus.validators import validate_language_code, validate_image_filename_extension, \
validate_markdown_filename_extension, validate_es_mapping, validate_mimetype, validate_search_filter, \
validate_name_is_not_a_route_parameter, validate_search_filter_with_mapping, validate_searchable_field_has_full_text_search, \
validate_visualizations_with_mapping, validate_implication
validate_visualizations_with_mapping, validate_implication, any_date_fields, visualisations_require_date_field

MAX_LENGTH_NAME = 126
MAX_LENGTH_DESCRIPTION = 254
Expand Down Expand Up @@ -269,3 +269,9 @@ def clean(self):
validate_implication(self.search_field_core, self.searchable, "Core search fields must be searchable")
except ValidationError as e:
warnings.warn(e.message)

validate_implication(
self.visualizations, self.corpus_configuration.fields.all(),
'The ngram visualisation requires a date field on the corpus',
visualisations_require_date_field, any_date_fields,
)
32 changes: 31 additions & 1 deletion backend/addcorpus/tests/test_validators.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import pytest
from addcorpus.es_mappings import int_mapping, text_mapping, keyword_mapping
from addcorpus.models import Field
from addcorpus.es_mappings import int_mapping, text_mapping, keyword_mapping, main_content_mapping, date_mapping
from addcorpus.validators import *

def test_validate_mimetype():
Expand Down Expand Up @@ -71,3 +72,32 @@ def test_filename_validation():
with pytest.raises(ValidationError):
validate_image_filename_extension('image.txt')

def test_validate_ngram_has_date_field():
    '''The ngram visualisation should only validate when a date field is present.'''
    content = Field(
        name='content',
        es_mapping=main_content_mapping(),
        visualizations=['wordcloud', 'ngram'],
    )
    date = Field(name='date', es_mapping=date_mapping())

    # with a date field alongside the content field, the implication holds
    validate_implication(
        content.visualizations, [content, date],
        '',
        visualisations_require_date_field,
        any_date_fields,
    )

    # without any date field, the ngram visualisation must be rejected
    with pytest.raises(ValidationError):
        validate_implication(
            content.visualizations, [content],
            '',
            visualisations_require_date_field,
            any_date_fields,
        )
7 changes: 7 additions & 0 deletions backend/addcorpus/validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,3 +152,10 @@ def validate_markdown_filename_extension(filename):
def validate_image_filename_extension(filename):
allowed = ['.jpeg', '.jpg', '.png', '.JPG']
validate_filename_extension(filename, allowed)

def any_date_fields(fields):
    '''
    Whether any of the given fields has a `date` primary mapping type.

    `fields` is an iterable of Field objects; each one's `es_mapping` is
    inspected via `primary_mapping_type`.
    '''
    # generator expression instead of a named lambda (PEP 8 discourages
    # assigning lambdas to names); any() short-circuits on the first match
    return any(
        primary_mapping_type(field.es_mapping) == 'date'
        for field in fields
    )

def visualisations_require_date_field(visualisations):
    '''
    Whether the given visualisations require a date field on the corpus.

    Currently only the ngram visualisation needs one. `visualisations` may be
    None or an empty list (fields without visualisations).
    '''
    # bool() normalises the falsy None / [] input to False, so callers always
    # get a boolean rather than the input value leaking through
    return bool(visualisations) and 'ngram' in visualisations
41 changes: 29 additions & 12 deletions backend/api/tests/test_api_views.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,11 @@
from addcorpus.models import Corpus
from rest_framework.status import is_success

def test_search_history_view(admin_user, admin_client):
corpus = Corpus.objects.create(name = 'mock-corpus')

# get search history
response = admin_client.get('/api/search_history/')
assert is_success(response.status_code)
assert len(response.data) == 0

# add a query to search history
data = {
def mock_query_data(user, corpus_name):
return {
'aborted': False,
'corpus': corpus.name,
'user': admin_user.id,
'corpus': corpus_name,
'user': user.id,
'started': datetime.now().isoformat(),
'completed': datetime.now().isoformat(),
'query_json': {
Expand All @@ -25,6 +17,17 @@ def test_search_history_view(admin_user, admin_client):
'total_results': 10,
'transferred': 0,
}

def test_search_history_view(admin_user, admin_client):
corpus = Corpus.objects.create(name = 'mock-corpus')

# get search history
response = admin_client.get('/api/search_history/')
assert is_success(response.status_code)
assert len(response.data) == 0

# add a query to search history
data = mock_query_data(admin_user, 'mock-corpus')
response = admin_client.post('/api/search_history/', data, content_type='application/json')
assert is_success(response.status_code)

Expand All @@ -34,6 +37,20 @@ def test_search_history_view(admin_user, admin_client):
assert len(response.data) == 1


def test_delete_search_history(auth_client, auth_user, db):
    '''The delete_all endpoint should remove all of the user's saved queries.'''
    mock_corpus = 'mock-corpus'
    # the corpus only needs to exist; the object itself is not used again
    Corpus.objects.create(name=mock_corpus)
    query = mock_query_data(auth_user, mock_corpus)
    response = auth_client.post('/api/search_history/', query, content_type='application/json')
    # guard: if setup fails, the deletion assertions below are meaningless
    assert is_success(response.status_code)

    assert auth_user.queries.count() == 1

    response = auth_client.post('/api/search_history/delete_all/')
    assert is_success(response.status_code)

    assert auth_user.queries.count() == 0


def test_task_status_view(transactional_db, admin_client, celery_worker):
bad_request = {
'bad_key': 'data'
Expand Down
8 changes: 7 additions & 1 deletion backend/api/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
from api.serializers import QuerySerializer
from rest_framework.permissions import IsAuthenticated
from rest_framework.exceptions import APIException
from rest_framework.decorators import action
import logging
from rest_framework.permissions import IsAuthenticated
from api.utils import check_json_keys
from celery import current_app as celery_app

Expand All @@ -23,6 +23,12 @@ class QueryViewset(viewsets.ModelViewSet):
def get_queryset(self):
return self.request.user.queries.all()

@action(detail=False, methods=['post'])
def delete_all(self, request):
    '''Delete every query in the requesting user's search history.'''
    self.get_queryset().delete()
    return Response('success')

class TaskStatusView(APIView):
'''
Get the status of an array of backend tasks (working/done/failed),
Expand Down
2 changes: 1 addition & 1 deletion backend/corpora/dbnl/dbnl.py
Original file line number Diff line number Diff line change
Expand Up @@ -362,7 +362,7 @@ def _xml_files(self):
transform_soup_func=utils.pad_content,
),
es_mapping=main_content_mapping(token_counts=True),
visualizations=['wordcloud', 'ngram'],
visualizations=['wordcloud'],
)

has_content = FieldDefinition(
Expand Down
6 changes: 5 additions & 1 deletion backend/corpora/dutchannualreports/dutchannualreports.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
from addcorpus.corpus import XMLCorpusDefinition, FieldDefinition
from media.image_processing import get_pdf_info, retrieve_pdf, pdf_pages, build_partial_pdf
from addcorpus.load_corpus import corpus_dir

from addcorpus.es_mappings import keyword_mapping, main_content_mapping
from addcorpus.es_settings import es_settings

from media.media_url import media_url

Expand Down Expand Up @@ -48,6 +48,10 @@ class DutchAnnualReports(XMLCorpusDefinition):

dutchannualreports_map = {}

@property
def es_settings(self):
return es_settings(self.languages[0], stopword_analyzer=True, stemming_analyzer=True)

with open(op.join(corpus_dir('dutchannualreports'), 'dutchannualreports_mapping.csv')) as f:
reader = csv.DictReader(f)
for line in reader:
Expand Down
9 changes: 4 additions & 5 deletions backend/corpora/ecco/ecco.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,6 @@ class Ecco(XMLCorpusDefinition):
description_page = 'ecco.md'
min_date = datetime(year=1700, month=1, day=1)
max_date = datetime(year=1800, month=12, day=31)

@property
def es_settings(self):
return es_settings(self.languages[0], stopword_analyzer=True, stemming_analyzer=True)

data_directory = settings.ECCO_DATA
es_index = getattr(settings, 'ECCO_ES_INDEX', 'ecco')
image = 'ecco.jpg'
Expand All @@ -47,6 +42,10 @@ def es_settings(self):

meta_pattern = re.compile('^\d+\_DocMetadata\.xml$')

@property
def es_settings(self):
return es_settings(self.languages[0], stopword_analyzer=True, stemming_analyzer=True)

def sources(self, start=min_date, end=max_date):
logging.basicConfig(filename='ecco.log', level=logging.INFO)

Expand Down
2 changes: 1 addition & 1 deletion backend/corpora/parliament/finland-old.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
class ParliamentFinlandOld(Parliament, CSVCorpusDefinition):
title = 'People and Parliament (Finland, 1863-1905)'
description = 'Speeches from the early Finnish estates'
max_date = datetime(year=1905, month=12, day=31)
max_date = datetime(year=1906, month=12, day=31)
min_date = datetime(year=1863, month=1, day=1)
data_directory = settings.PP_FINLAND_OLD_DATA
es_index = getattr(settings, 'PP_FINLAND_OLD_INDEX', 'parliament-finland-old')
Expand Down
2 changes: 1 addition & 1 deletion backend/corpora/parliament/netherlands.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ class ParliamentNetherlands(Parliament, XMLCorpusDefinition):
title = "People & Parliament (Netherlands)"
description = "Speeches from the Eerste Kamer and Tweede Kamer"
min_date = datetime(year=1815, month=1, day=1)
max_date = datetime(year=2020, month=12, day=31)
max_date = datetime(year=2022, month=12, day=31)
data_directory = settings.PP_NL_DATA
data_directory_recent = settings.PP_NL_RECENT_DATA
word_model_path = getattr(settings, 'PP_NL_WM', None)
Expand Down
10 changes: 7 additions & 3 deletions backend/download/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,18 +50,22 @@ def index_ml_mock_corpus(es_client, ml_mock_corpus):
def index_mock_corpus(es_client, mock_corpus, index_small_mock_corpus, index_large_mock_corpus, index_ml_mock_corpus):
yield mock_corpus

def save_all_results_csv(mock_corpus, mock_corpus_specs):
def all_results_request_json(mock_corpus, mock_corpus_specs):
fields = mock_corpus_specs['fields']
query = mock_corpus_specs['example_query']

request_json = {
return {
'corpus': mock_corpus,
'es_query': MATCH_ALL,
'fields': fields,
'route': '/search/{};query={}'.format(mock_corpus, query)
}

def save_all_results_csv(mock_corpus, mock_corpus_specs):
request_json = all_results_request_json(mock_corpus, mock_corpus_specs)
results = tasks.download_scroll(request_json)
filename = tasks.make_csv(results, request_json)
fake_id = mock_corpus + '_all_results'
filename = tasks.make_csv(results, request_json, fake_id)

return filename

Expand Down
21 changes: 6 additions & 15 deletions backend/download/create_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,10 @@ def write_file(filename, fieldnames, rows, dialect = 'excel'):

return filepath

def create_filename(descriptive_part, essential_suffix = '.csv'):
max_length = 255 - (len(essential_suffix) + len(settings.CSV_FILES_PATH))
truncated = descriptive_part[:min(max_length, len(descriptive_part))]
return truncated + essential_suffix
def create_filename(download_id):
    '''Return the CSV filename for a download, derived from its database id.'''
    return '{}.csv'.format(download_id)

def search_results_csv(results, fields, query):
def search_results_csv(results, fields, query, download_id):
entries = []
field_set = set(fields)
field_set.update(['query'])
Expand All @@ -51,14 +49,14 @@ def search_results_csv(results, fields, query):
entry.update({highlight_field_name: soup.get_text()})
entries.append(entry)

filename = create_filename(query)
filename = create_filename(download_id)
field_set.discard('context')
fieldnames = sorted(field_set)
filepath = write_file(filename, fieldnames, entries, dialect = 'resultsDialect')
return filepath


def term_frequency_csv(queries, results, field_name, unit = None):
def term_frequency_csv(queries, results, field_name, download_id, unit = None):
has_token_counts = results[0].get('token_count', None) != None
query_column = ['Query'] if len(queries) > 1 else []
freq_columns = ['Term frequency', 'Relative term frequency (by # documents)', 'Total documents']
Expand All @@ -67,17 +65,10 @@ def term_frequency_csv(queries, results, field_name, unit = None):

rows = term_frequency_csv_rows(queries, results, field_name, unit)

filename = term_frequency_filename(queries, field_name)
filename = create_filename(download_id)
filepath = write_file(filename, fieldnames, rows)
return filepath

def term_frequency_filename(queries, field_name):
querystring = '_'.join(queries)
timestamp = datetime.now().isoformat(sep='_', timespec='minutes') # ensure csv filenames are unique with timestamp
suffix = '_' + timestamp + '.csv'
description = 'term_frequency_{}_{}'.format(field_name, querystring)
return create_filename(description, suffix)

def term_frequency_csv_rows(queries, results, field_name, unit):
for result in results:
field_value = format_field_value(result['key'], unit)
Expand Down
4 changes: 2 additions & 2 deletions backend/download/mail.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ def send_csv_email(user_email, username, download_id):

subject = 'I-Analyzer CSV download'
from_email = settings.DEFAULT_FROM_EMAIL
path = Download.objects.get(id=download_id).filename
_, filename = os.path.split(path)
download = Download.objects.get(id=download_id)
filename = download.descriptive_filename()

context = {
'email_title': 'Download CSV',
Expand Down
7 changes: 7 additions & 0 deletions backend/download/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,3 +53,10 @@ def complete(self, filename = None):
self.filename = filename
self.completed = timezone.now()
self.save()

def descriptive_filename(self):
    '''
    A human-readable filename for this download, combining the download
    type, corpus name, and completion timestamp.
    '''
    when = self.completed.strftime('%Y-%m-%d %H:%M')
    return '__'.join([self.download_type, self.corpus.name, when]) + '.csv'
Loading

0 comments on commit 1369c43

Please sign in to comment.