Skip to content

Commit

Permalink
Merge branch 'develop' into feature/full-ngram
Browse files Browse the repository at this point in the history
  • Loading branch information
BeritJanssen committed Oct 26, 2023
2 parents ed49010 + c1b99ae commit 1369c43
Show file tree
Hide file tree
Showing 68 changed files with 870 additions and 146 deletions.
36 changes: 36 additions & 0 deletions .github/ISSUE_TEMPLATE/bug_report.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
---
name: Bug report
about: Let us know about something that isn't working right
title: ''
labels: bug
assignees: ''

---

### What went wrong?

Describe what happened.

### Expected behavior

What did you expect to happen?

### Screenshots

If applicable, please add a screenshot of the problem!

### Which version?

Please specify where you encountered the issue:

- [ ] https://ianalyzer.hum.uu.nl
- [ ] https://peopleandparliament.hum.uu.nl
- [ ] https://peace.sites.uu.nl/
- [ ] a server hosted elsewhere (i.e. not by the research software lab)
- [ ] a local server

If this happened on a local or third-party server, it helps if you can be more specific about the version. Please include the version number (e.g. "3.2.4") or a commit hash if you know it!

### To reproduce

How can a developer replicate the issue? Please provide any information you can. For example: "I went to https://ianalyzer.hum.uu.nl/search/troonredes?date=1814-01-01:1972-01-01 and then clicked on *Download CSV*. I pressed *cancel* and then I clicked *Download CSV* again."
20 changes: 20 additions & 0 deletions .github/ISSUE_TEMPLATE/feature_request.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
---
name: Feature request
about: Suggest an idea for something new
title: ''
labels: enhancement
assignees: ''

---

**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]

**Describe the solution you'd like**
A clear and concise description of what you want to happen.

**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.

**Additional context**
Add any other context or screenshots about the feature request here.
2 changes: 1 addition & 1 deletion backend/addcorpus/es_mappings.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
def main_content_mapping(token_counts=True, stopword_analysis=False, stemming_analysis=False, updated_highlighting=False):
def main_content_mapping(token_counts=True, stopword_analysis=False, stemming_analysis=False, updated_highlighting=True):
'''
Mapping for the main content field. Options:
Expand Down
8 changes: 7 additions & 1 deletion backend/addcorpus/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from addcorpus.validators import validate_language_code, validate_image_filename_extension, \
validate_markdown_filename_extension, validate_es_mapping, validate_mimetype, validate_search_filter, \
validate_name_is_not_a_route_parameter, validate_search_filter_with_mapping, validate_searchable_field_has_full_text_search, \
validate_visualizations_with_mapping, validate_implication
validate_visualizations_with_mapping, validate_implication, any_date_fields, visualisations_require_date_field

MAX_LENGTH_NAME = 126
MAX_LENGTH_DESCRIPTION = 254
Expand Down Expand Up @@ -269,3 +269,9 @@ def clean(self):
validate_implication(self.search_field_core, self.searchable, "Core search fields must be searchable")
except ValidationError as e:
warnings.warn(e.message)

validate_implication(
self.visualizations, self.corpus_configuration.fields.all(),
'The ngram visualisation requires a date field on the corpus',
visualisations_require_date_field, any_date_fields,
)
32 changes: 31 additions & 1 deletion backend/addcorpus/tests/test_validators.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import pytest
from addcorpus.es_mappings import int_mapping, text_mapping, keyword_mapping
from addcorpus.models import Field
from addcorpus.es_mappings import int_mapping, text_mapping, keyword_mapping, main_content_mapping, date_mapping
from addcorpus.validators import *

def test_validate_mimetype():
Expand Down Expand Up @@ -71,3 +72,32 @@ def test_filename_validation():
with pytest.raises(ValidationError):
validate_image_filename_extension('image.txt')

def test_validate_ngram_has_date_field():
    '''The ngram visualisation should only validate when a date field is present.'''
    content = Field(
        name='content',
        es_mapping=main_content_mapping(),
        visualizations=['wordcloud', 'ngram'],
    )
    date = Field(name='date', es_mapping=date_mapping())

    # with a date field alongside the content field, the implication holds
    validate_implication(
        content.visualizations, [content, date],
        '',
        visualisations_require_date_field,
        any_date_fields,
    )

    # without any date field, the ngram visualisation must be rejected
    with pytest.raises(ValidationError):
        validate_implication(
            content.visualizations, [content],
            '',
            visualisations_require_date_field,
            any_date_fields,
        )
7 changes: 7 additions & 0 deletions backend/addcorpus/validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,3 +152,10 @@ def validate_markdown_filename_extension(filename):
def validate_image_filename_extension(filename):
allowed = ['.jpeg', '.jpg', '.png', '.JPG']
validate_filename_extension(filename, allowed)

def any_date_fields(fields):
    '''
    Whether any of the given fields has a `date` primary mapping type.

    `fields` is an iterable of Field objects; each one's `es_mapping` is
    inspected via `primary_mapping_type`.
    '''
    # generator expression instead of a named lambda (PEP 8 discourages
    # assigning lambdas to names); any() short-circuits on the first match
    return any(
        primary_mapping_type(field.es_mapping) == 'date'
        for field in fields
    )

def visualisations_require_date_field(visualisations):
    '''
    Whether the given visualisations require a date field on the corpus.

    Currently only the ngram visualisation needs one. `visualisations` may be
    None or an empty list (fields without visualisations).
    '''
    # bool() normalises the falsy None / [] input to False, so callers always
    # get a boolean rather than the input value leaking through
    return bool(visualisations) and 'ngram' in visualisations
41 changes: 29 additions & 12 deletions backend/api/tests/test_api_views.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,11 @@
from addcorpus.models import Corpus
from rest_framework.status import is_success

def test_search_history_view(admin_user, admin_client):
corpus = Corpus.objects.create(name = 'mock-corpus')

# get search history
response = admin_client.get('/api/search_history/')
assert is_success(response.status_code)
assert len(response.data) == 0

# add a query to search history
data = {
def mock_query_data(user, corpus_name):
return {
'aborted': False,
'corpus': corpus.name,
'user': admin_user.id,
'corpus': corpus_name,
'user': user.id,
'started': datetime.now().isoformat(),
'completed': datetime.now().isoformat(),
'query_json': {
Expand All @@ -25,6 +17,17 @@ def test_search_history_view(admin_user, admin_client):
'total_results': 10,
'transferred': 0,
}

def test_search_history_view(admin_user, admin_client):
corpus = Corpus.objects.create(name = 'mock-corpus')

# get search history
response = admin_client.get('/api/search_history/')
assert is_success(response.status_code)
assert len(response.data) == 0

# add a query to search history
data = mock_query_data(admin_user, 'mock-corpus')
response = admin_client.post('/api/search_history/', data, content_type='application/json')
assert is_success(response.status_code)

Expand All @@ -34,6 +37,20 @@ def test_search_history_view(admin_user, admin_client):
assert len(response.data) == 1


def test_delete_search_history(auth_client, auth_user, db):
    '''The delete_all endpoint should remove all of the user's saved queries.'''
    mock_corpus = 'mock-corpus'
    # the corpus only needs to exist; the object itself is not used again
    Corpus.objects.create(name=mock_corpus)
    query = mock_query_data(auth_user, mock_corpus)
    response = auth_client.post('/api/search_history/', query, content_type='application/json')
    # guard: if setup fails, the deletion assertions below are meaningless
    assert is_success(response.status_code)

    assert auth_user.queries.count() == 1

    response = auth_client.post('/api/search_history/delete_all/')
    assert is_success(response.status_code)

    assert auth_user.queries.count() == 0


def test_task_status_view(transactional_db, admin_client, celery_worker):
bad_request = {
'bad_key': 'data'
Expand Down
8 changes: 7 additions & 1 deletion backend/api/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
from api.serializers import QuerySerializer
from rest_framework.permissions import IsAuthenticated
from rest_framework.exceptions import APIException
from rest_framework.decorators import action
import logging
from rest_framework.permissions import IsAuthenticated
from api.utils import check_json_keys
from celery import current_app as celery_app

Expand All @@ -23,6 +23,12 @@ class QueryViewset(viewsets.ModelViewSet):
def get_queryset(self):
return self.request.user.queries.all()

@action(detail=False, methods=['post'])
def delete_all(self, request):
    '''Delete every query in the requesting user's search history.'''
    self.get_queryset().delete()
    return Response('success')

class TaskStatusView(APIView):
'''
Get the status of an array of backend tasks (working/done/failed),
Expand Down
2 changes: 1 addition & 1 deletion backend/corpora/dbnl/dbnl.py
Original file line number Diff line number Diff line change
Expand Up @@ -362,7 +362,7 @@ def _xml_files(self):
transform_soup_func=utils.pad_content,
),
es_mapping=main_content_mapping(token_counts=True),
visualizations=['wordcloud', 'ngram'],
visualizations=['wordcloud'],
)

has_content = FieldDefinition(
Expand Down
6 changes: 5 additions & 1 deletion backend/corpora/dutchannualreports/dutchannualreports.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
from addcorpus.corpus import XMLCorpusDefinition, FieldDefinition
from media.image_processing import get_pdf_info, retrieve_pdf, pdf_pages, build_partial_pdf
from addcorpus.load_corpus import corpus_dir

from addcorpus.es_mappings import keyword_mapping, main_content_mapping
from addcorpus.es_settings import es_settings

from media.media_url import media_url

Expand Down Expand Up @@ -48,6 +48,10 @@ class DutchAnnualReports(XMLCorpusDefinition):

dutchannualreports_map = {}

@property
def es_settings(self):
return es_settings(self.languages[0], stopword_analyzer=True, stemming_analyzer=True)

with open(op.join(corpus_dir('dutchannualreports'), 'dutchannualreports_mapping.csv')) as f:
reader = csv.DictReader(f)
for line in reader:
Expand Down
9 changes: 4 additions & 5 deletions backend/corpora/ecco/ecco.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,6 @@ class Ecco(XMLCorpusDefinition):
description_page = 'ecco.md'
min_date = datetime(year=1700, month=1, day=1)
max_date = datetime(year=1800, month=12, day=31)

@property
def es_settings(self):
return es_settings(self.languages[0], stopword_analyzer=True, stemming_analyzer=True)

data_directory = settings.ECCO_DATA
es_index = getattr(settings, 'ECCO_ES_INDEX', 'ecco')
image = 'ecco.jpg'
Expand All @@ -47,6 +42,10 @@ def es_settings(self):

meta_pattern = re.compile('^\d+\_DocMetadata\.xml$')

@property
def es_settings(self):
return es_settings(self.languages[0], stopword_analyzer=True, stemming_analyzer=True)

def sources(self, start=min_date, end=max_date):
logging.basicConfig(filename='ecco.log', level=logging.INFO)

Expand Down
2 changes: 1 addition & 1 deletion backend/corpora/parliament/finland-old.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
class ParliamentFinlandOld(Parliament, CSVCorpusDefinition):
title = 'People and Parliament (Finland, 1863-1905)'
description = 'Speeches from the early Finnish estates'
max_date = datetime(year=1905, month=12, day=31)
max_date = datetime(year=1906, month=12, day=31)
min_date = datetime(year=1863, month=1, day=1)
data_directory = settings.PP_FINLAND_OLD_DATA
es_index = getattr(settings, 'PP_FINLAND_OLD_INDEX', 'parliament-finland-old')
Expand Down
2 changes: 1 addition & 1 deletion backend/corpora/parliament/netherlands.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ class ParliamentNetherlands(Parliament, XMLCorpusDefinition):
title = "People & Parliament (Netherlands)"
description = "Speeches from the Eerste Kamer and Tweede Kamer"
min_date = datetime(year=1815, month=1, day=1)
max_date = datetime(year=2020, month=12, day=31)
max_date = datetime(year=2022, month=12, day=31)
data_directory = settings.PP_NL_DATA
data_directory_recent = settings.PP_NL_RECENT_DATA
word_model_path = getattr(settings, 'PP_NL_WM', None)
Expand Down
10 changes: 7 additions & 3 deletions backend/download/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,18 +50,22 @@ def index_ml_mock_corpus(es_client, ml_mock_corpus):
def index_mock_corpus(es_client, mock_corpus, index_small_mock_corpus, index_large_mock_corpus, index_ml_mock_corpus):
yield mock_corpus

def save_all_results_csv(mock_corpus, mock_corpus_specs):
def all_results_request_json(mock_corpus, mock_corpus_specs):
fields = mock_corpus_specs['fields']
query = mock_corpus_specs['example_query']

request_json = {
return {
'corpus': mock_corpus,
'es_query': MATCH_ALL,
'fields': fields,
'route': '/search/{};query={}'.format(mock_corpus, query)
}

def save_all_results_csv(mock_corpus, mock_corpus_specs):
request_json = all_results_request_json(mock_corpus, mock_corpus_specs)
results = tasks.download_scroll(request_json)
filename = tasks.make_csv(results, request_json)
fake_id = mock_corpus + '_all_results'
filename = tasks.make_csv(results, request_json, fake_id)

return filename

Expand Down
21 changes: 6 additions & 15 deletions backend/download/create_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,10 @@ def write_file(filename, fieldnames, rows, dialect = 'excel'):

return filepath

def create_filename(descriptive_part, essential_suffix = '.csv'):
max_length = 255 - (len(essential_suffix) + len(settings.CSV_FILES_PATH))
truncated = descriptive_part[:min(max_length, len(descriptive_part))]
return truncated + essential_suffix
def create_filename(download_id):
    '''Return the CSV filename for a download, derived from its database id.'''
    return '{}.csv'.format(download_id)

def search_results_csv(results, fields, query):
def search_results_csv(results, fields, query, download_id):
entries = []
field_set = set(fields)
field_set.update(['query'])
Expand All @@ -51,14 +49,14 @@ def search_results_csv(results, fields, query):
entry.update({highlight_field_name: soup.get_text()})
entries.append(entry)

filename = create_filename(query)
filename = create_filename(download_id)
field_set.discard('context')
fieldnames = sorted(field_set)
filepath = write_file(filename, fieldnames, entries, dialect = 'resultsDialect')
return filepath


def term_frequency_csv(queries, results, field_name, unit = None):
def term_frequency_csv(queries, results, field_name, download_id, unit = None):
has_token_counts = results[0].get('token_count', None) != None
query_column = ['Query'] if len(queries) > 1 else []
freq_columns = ['Term frequency', 'Relative term frequency (by # documents)', 'Total documents']
Expand All @@ -67,17 +65,10 @@ def term_frequency_csv(queries, results, field_name, unit = None):

rows = term_frequency_csv_rows(queries, results, field_name, unit)

filename = term_frequency_filename(queries, field_name)
filename = create_filename(download_id)
filepath = write_file(filename, fieldnames, rows)
return filepath

def term_frequency_filename(queries, field_name):
querystring = '_'.join(queries)
timestamp = datetime.now().isoformat(sep='_', timespec='minutes') # ensure csv filenames are unique with timestamp
suffix = '_' + timestamp + '.csv'
description = 'term_frequency_{}_{}'.format(field_name, querystring)
return create_filename(description, suffix)

def term_frequency_csv_rows(queries, results, field_name, unit):
for result in results:
field_value = format_field_value(result['key'], unit)
Expand Down
4 changes: 2 additions & 2 deletions backend/download/mail.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ def send_csv_email(user_email, username, download_id):

subject = 'I-Analyzer CSV download'
from_email = settings.DEFAULT_FROM_EMAIL
path = Download.objects.get(id=download_id).filename
_, filename = os.path.split(path)
download = Download.objects.get(id=download_id)
filename = download.descriptive_filename()

context = {
'email_title': 'Download CSV',
Expand Down
7 changes: 7 additions & 0 deletions backend/download/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,3 +53,10 @@ def complete(self, filename = None):
self.filename = filename
self.completed = timezone.now()
self.save()

def descriptive_filename(self):
    '''
    A human-readable filename for this download, combining the download
    type, corpus name, and completion timestamp.
    '''
    when = self.completed.strftime('%Y-%m-%d %H:%M')
    return '__'.join([self.download_type, self.corpus.name, when]) + '.csv'
Loading

0 comments on commit 1369c43

Please sign in to comment.