diff --git a/backend/addcorpus/es_mappings.py b/backend/addcorpus/es_mappings.py
index a2f58418f..54bddbaa0 100644
--- a/backend/addcorpus/es_mappings.py
+++ b/backend/addcorpus/es_mappings.py
@@ -1,4 +1,4 @@
-def main_content_mapping(token_counts=True, stopword_analysis=False, stemming_analysis=False, updated_highlighting=False):
+def main_content_mapping(token_counts=True, stopword_analysis=False, stemming_analysis=False, updated_highlighting=True):
'''
Mapping for the main content field. Options:
diff --git a/backend/corpora/dutchannualreports/dutchannualreports.py b/backend/corpora/dutchannualreports/dutchannualreports.py
index d4f4c7038..6a7c89168 100644
--- a/backend/corpora/dutchannualreports/dutchannualreports.py
+++ b/backend/corpora/dutchannualreports/dutchannualreports.py
@@ -12,8 +12,8 @@
from addcorpus.corpus import XMLCorpusDefinition, FieldDefinition
from media.image_processing import get_pdf_info, retrieve_pdf, pdf_pages, build_partial_pdf
from addcorpus.load_corpus import corpus_dir
-
from addcorpus.es_mappings import keyword_mapping, main_content_mapping
+from addcorpus.es_settings import es_settings
from media.media_url import media_url
@@ -48,6 +48,10 @@ class DutchAnnualReports(XMLCorpusDefinition):
dutchannualreports_map = {}
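+    # use an analysis chain with stopword removal and stemming for the corpus language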
+ @property
+ def es_settings(self):
+ return es_settings(self.languages[0], stopword_analyzer=True, stemming_analyzer=True)
+
with open(op.join(corpus_dir('dutchannualreports'), 'dutchannualreports_mapping.csv')) as f:
reader = csv.DictReader(f)
for line in reader:
diff --git a/backend/corpora/ecco/ecco.py b/backend/corpora/ecco/ecco.py
index d23ef196b..a6b517dac 100644
--- a/backend/corpora/ecco/ecco.py
+++ b/backend/corpora/ecco/ecco.py
@@ -29,11 +29,6 @@ class Ecco(XMLCorpusDefinition):
description_page = 'ecco.md'
min_date = datetime(year=1700, month=1, day=1)
max_date = datetime(year=1800, month=12, day=31)
-
- @property
- def es_settings(self):
- return es_settings(self.languages[0], stopword_analyzer=True, stemming_analyzer=True)
-
data_directory = settings.ECCO_DATA
es_index = getattr(settings, 'ECCO_ES_INDEX', 'ecco')
image = 'ecco.jpg'
@@ -47,6 +42,10 @@ def es_settings(self):
meta_pattern = re.compile('^\d+\_DocMetadata\.xml$')
+ @property
+ def es_settings(self):
+ return es_settings(self.languages[0], stopword_analyzer=True, stemming_analyzer=True)
+
def sources(self, start=min_date, end=max_date):
logging.basicConfig(filename='ecco.log', level=logging.INFO)
diff --git a/backend/corpora/parliament/finland-old.py b/backend/corpora/parliament/finland-old.py
index 59bf354c6..7e654b941 100644
--- a/backend/corpora/parliament/finland-old.py
+++ b/backend/corpora/parliament/finland-old.py
@@ -14,7 +14,7 @@
class ParliamentFinlandOld(Parliament, CSVCorpusDefinition):
title = 'People and Parliament (Finland, 1863-1905)'
description = 'Speeches from the early Finnish estates'
- max_date = datetime(year=1905, month=12, day=31)
+ max_date = datetime(year=1906, month=12, day=31)
min_date = datetime(year=1863, month=1, day=1)
data_directory = settings.PP_FINLAND_OLD_DATA
es_index = getattr(settings, 'PP_FINLAND_OLD_INDEX', 'parliament-finland-old')
diff --git a/backend/corpora/parliament/netherlands.py b/backend/corpora/parliament/netherlands.py
index 49f4ba5a1..7e3f51e6b 100644
--- a/backend/corpora/parliament/netherlands.py
+++ b/backend/corpora/parliament/netherlands.py
@@ -124,7 +124,7 @@ class ParliamentNetherlands(Parliament, XMLCorpusDefinition):
title = "People & Parliament (Netherlands)"
description = "Speeches from the Eerste Kamer and Tweede Kamer"
min_date = datetime(year=1815, month=1, day=1)
- max_date = datetime(year=2020, month=12, day=31)
+ max_date = datetime(year=2022, month=12, day=31)
data_directory = settings.PP_NL_DATA
data_directory_recent = settings.PP_NL_RECENT_DATA
word_model_path = getattr(settings, 'PP_NL_WM', None)
diff --git a/backend/download/create_csv.py b/backend/download/create_csv.py
index f0f93b8e9..b807f7f10 100644
--- a/backend/download/create_csv.py
+++ b/backend/download/create_csv.py
@@ -5,6 +5,7 @@
from django.conf import settings
+from visualization.query import get_query_text
from visualization.term_frequency import parse_datestring
def write_file(filename, fieldnames, rows, dialect = 'excel'):
@@ -99,4 +100,22 @@ def format_field_value(value, unit):
'week': '%Y-%m-%d',
'day': '%Y-%m-%d'
}
- return date.strftime(formats[unit])
+ return date.strftime(formats[unit])
+
+def ngram_csv(results, log_id):
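+    '''Write ngram visualisation results to a csv file and return the file path.'''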
+ rows = ngram_table(results)
+ fieldnames = ['date', 'N-gram', 'Frequency']
+ filename = create_filename(log_id)
+ filepath = write_file(filename, fieldnames, rows)
+ return filepath
+
+def ngram_table(results):
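+    '''Flatten ngram results into one row per time interval per ngram.'''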
+ rows = []
+ for index, time_point in enumerate(results['time_points']):
+ for ngram in results['words']:
+ rows.append({
+ 'date': time_point,
+ 'N-gram': ngram['label'],
+ 'Frequency': ngram['data'][index]
+ })
+ return rows
diff --git a/backend/download/migrations/0002_alter_download_download_type.py b/backend/download/migrations/0002_alter_download_download_type.py
new file mode 100644
index 000000000..dd44e9d2f
--- /dev/null
+++ b/backend/download/migrations/0002_alter_download_download_type.py
@@ -0,0 +1,18 @@
+# Generated by Django 4.1.10 on 2023-10-18 12:52
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('download', '0001_initial'),
+ ]
+
+ operations = [
+ migrations.AlterField(
+ model_name='download',
+ name='download_type',
+ field=models.CharField(choices=[('search_results', 'Search results'), ('date_term_frequency', 'Term frequency (timeline)'), ('aggregate_term_frequency', 'Term frequency (histogram)'), ('ngram', 'Neighbouring words')], help_text='Type of download (search results or a type of visualisation)', max_length=126),
+ ),
+ ]
diff --git a/backend/download/models.py b/backend/download/models.py
index 2e5e9bcd1..3afe3074f 100644
--- a/backend/download/models.py
+++ b/backend/download/models.py
@@ -1,8 +1,9 @@
from django.db import models
+from django.conf import settings
+from django.utils import timezone
+
from users.models import CustomUser
from addcorpus.models import Corpus
-from django.conf import settings
-from datetime import datetime
MAX_LENGTH_FILENAME = 254
@@ -17,6 +18,7 @@ class Download(models.Model):
('search_results', 'Search results'),
('date_term_frequency', 'Term frequency (timeline)'),
('aggregate_term_frequency', 'Term frequency (histogram)'),
+ ('ngram', 'Neighbouring words')
],
help_text='Type of download (search results or a type of visualisation)')
corpus = models.ForeignKey(Corpus, on_delete=models.CASCADE, to_field='name', related_name='downloads')
@@ -49,7 +51,7 @@ def complete(self, filename = None):
'''
self.filename = filename
- self.completed = datetime.now()
+ self.completed = timezone.now()
self.save()
def descriptive_filename(self):
diff --git a/backend/download/tasks.py b/backend/download/tasks.py
index a47340ba6..ecc9b5f0c 100644
--- a/backend/download/tasks.py
+++ b/backend/download/tasks.py
@@ -2,13 +2,12 @@
import re
from django.conf import settings
from celery import shared_task, chain, group
-from django.urls import reverse
from es import download as es_download
from download import create_csv
from download.models import Download
from addcorpus.models import Corpus
-from visualization.tasks import histogram_term_frequency_tasks, timeline_term_frequency_tasks
+from visualization.tasks import histogram_term_frequency_tasks, timeline_term_frequency_tasks, ngram_data_tasks
from visualization import query
from download.mail import send_csv_email
@@ -90,10 +89,12 @@ def download_search_results(request_json, user):
return try_download(make_chain, download)
@shared_task()
-def make_term_frequency_csv(results_per_series, parameters_per_series, log_id):
+def make_full_data_csv(results_per_series, visualization_type, parameters_per_series, log_id):
'''
-    Export term frequency results to a csv.
+    Export term frequency or ngram results to a csv.
'''
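+    # ngram results are already tabular; term frequency results need query metadata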
+ if visualization_type == 'ngram':
+ return create_csv.ngram_csv(results_per_series, log_id)
query_per_series, field_name, unit = extract_term_frequency_download_metadata(parameters_per_series)
return create_csv.term_frequency_csv(query_per_series, results_per_series, field_name, log_id, unit = unit)
@@ -110,6 +111,10 @@ def term_frequency_full_data_tasks(parameters_per_series, visualization_type):
task_function(series_parameters, True) for series_parameters in parameters_unlimited
)
+def ngram_full_data_tasks(ngram_parameters, visualization_type):
+    # visualization_type is unused; the second parameter keeps the signature
+    # consistent with term_frequency_full_data_tasks in task_per_type.
+    # max_size_per_interval=None so the download analyses all matching documents.
+    ngram_parameters['max_size_per_interval'] = None
+    return ngram_data_tasks(ngram_parameters)
+
def extract_term_frequency_download_metadata(parameters_per_series):
'''
Get some relevant metadata for a term frequency request:
@@ -148,16 +153,16 @@ def download_full_data(request_json, user):
'''
Download the full data for a visualisation
'''
-
visualization_type = request_json['visualization']
task_per_type = {
'date_term_frequency': term_frequency_full_data_tasks,
- 'aggregate_term_frequency': term_frequency_full_data_tasks
+ 'aggregate_term_frequency': term_frequency_full_data_tasks,
+ 'ngram': ngram_full_data_tasks,
}
parameters = request_json['parameters']
- corpus_name = request_json['corpus']
+ corpus_name = request_json['corpus_name']
corpus = Corpus.objects.get(name=corpus_name)
task = task_per_type[visualization_type](parameters, visualization_type)
@@ -166,7 +171,7 @@ def download_full_data(request_json, user):
make_chain = lambda : chain(
task,
- make_term_frequency_csv.s(parameters, download.id),
+ make_full_data_csv.s(visualization_type, parameters, download.id),
complete_download.s(download.id),
csv_data_email.s(user.email, user.username),
).on_error(complete_failed_download.s(download.id))
diff --git a/backend/download/tests/test_csv_results.py b/backend/download/tests/test_csv_results.py
index f33a896ef..6abc899e7 100644
--- a/backend/download/tests/test_csv_results.py
+++ b/backend/download/tests/test_csv_results.py
@@ -208,3 +208,26 @@ def test_date_format():
for value, unit, expected in cases:
assert create_csv.format_field_value(value, unit) == expected
+
+
+mock_ngram_data = {
+ 'words': [
+ {'label': 'ex parrot', 'data': [2, 3]},
+ {'label': 'this parrot what', 'data': [4, 8]},
+ {'label': 'dead parrot when', 'data': [4, 6]},
+ ],
+ 'time_points': ['1960-1965', '1962-1967']
+}
+
+expected_csv_table = [
+ {'date': '1960-1965', 'N-gram': 'ex parrot', 'Frequency': 2},
+ {'date': '1960-1965', 'N-gram': 'this parrot what', 'Frequency': 4},
+ {'date': '1960-1965', 'N-gram': 'dead parrot when', 'Frequency': 4},
+ {'date': '1962-1967', 'N-gram': 'ex parrot', 'Frequency': 3},
+ {'date': '1962-1967', 'N-gram': 'this parrot what', 'Frequency': 8},
+ {'date': '1962-1967', 'N-gram': 'dead parrot when', 'Frequency': 6},
+]
+
+def test_ngram_table():
+ table = create_csv.ngram_table(mock_ngram_data)
+ assert table == expected_csv_table
\ No newline at end of file
diff --git a/backend/download/tests/test_download_views.py b/backend/download/tests/test_download_views.py
index 6d9993815..42607a238 100644
--- a/backend/download/tests/test_download_views.py
+++ b/backend/download/tests/test_download_views.py
@@ -6,7 +6,7 @@
from download import SEARCH_RESULTS_DIALECT
from addcorpus.models import Corpus
import io
-from visualization.query import MATCH_ALL
+from visualization import query
from es.search import hits
from tag.models import Tag, TaggedDocument
@@ -48,21 +48,7 @@ def term_frequency_parameters(mock_corpus, mock_corpus_specs):
# TODO: construct query from query module, which is much more convenient
query_text = mock_corpus_specs['example_query']
search_field = mock_corpus_specs['content_field']
- query = {
- "query": {
- "bool": {
- "must": {
- "simple_query_string": {
- "query": query_text,
- "fields": [search_field],
- "lenient": True,
- "default_operator": "or"
- }
- },
- "filter": []
- }
- }
- }
+ query = mock_es_query(query_text, search_field)
return {
'es_query': query,
'corpus_name': mock_corpus,
@@ -78,14 +64,40 @@ def term_frequency_parameters(mock_corpus, mock_corpus_specs):
'unit': 'year',
}
+def ngram_parameters(mock_corpus, mock_corpus_specs):
+ query_text = mock_corpus_specs['example_query']
+ search_field = mock_corpus_specs['content_field']
+ return {
+ 'corpus_name': mock_corpus,
+ 'es_query': mock_es_query(query_text, search_field),
+ 'field': search_field,
+ 'ngram_size': 2,
+ 'term_position': 'any',
+ 'freq_compensation': True,
+ 'subfield': 'clean',
+ 'max_size_per_interval': 50,
+ 'number_of_ngrams': 10,
+ 'date_field': 'date'
+ }
+
+def mock_es_query(query_text, search_field):
+ q = query.MATCH_ALL
+ q = query.set_query_text(q, query_text)
+ q = query.set_search_fields(q, [search_field])
+ return q
+
+@pytest.mark.parametrize("visualization_type, request_parameters", [('date_term_frequency', term_frequency_parameters), ('ngram', ngram_parameters)])
def test_full_data_download_view(transactional_db, admin_client, small_mock_corpus,
index_small_mock_corpus, small_mock_corpus_specs, celery_worker,
- csv_directory):
- parameters = term_frequency_parameters(small_mock_corpus, small_mock_corpus_specs)
+ csv_directory, visualization_type, request_parameters):
+ parameters = request_parameters(small_mock_corpus, small_mock_corpus_specs)
+ if visualization_type != 'ngram':
+ # timeline and histogram expect a series of parameters
+ parameters = [parameters]
request_json = {
- 'visualization': 'date_term_frequency',
- 'parameters': [parameters],
- 'corpus': small_mock_corpus
+ 'visualization': visualization_type,
+ 'parameters': parameters,
+ 'corpus_name': small_mock_corpus
}
response = admin_client.post(
'/api/download/full_data',
@@ -160,7 +172,7 @@ def test_csv_download_view(admin_client, finished_download):
def some_document_id(admin_client, small_mock_corpus, index_small_mock_corpus):
search_response = admin_client.post(
f'/api/es/{small_mock_corpus}/_search',
- {'es_query': MATCH_ALL},
+ {'es_query': query.MATCH_ALL},
content_type='application/json'
)
@@ -188,7 +200,7 @@ def test_download_with_tag(db, admin_client, small_mock_corpus, index_small_mock
encoding = 'utf-8'
download_request_json = {
'corpus': small_mock_corpus,
- 'es_query': MATCH_ALL,
+ 'es_query': query.MATCH_ALL,
'tags': [tag_on_some_document.id],
'fields': ['date','content'],
'size': 3,
diff --git a/backend/download/tests/test_full_data.py b/backend/download/tests/test_full_data.py
index 47e553310..b3334ee39 100644
--- a/backend/download/tests/test_full_data.py
+++ b/backend/download/tests/test_full_data.py
@@ -21,11 +21,11 @@ def test_timeline_full_data(small_mock_corpus, index_small_mock_corpus, small_mo
],
'unit': 'year'
}]
-
- group = tasks.term_frequency_full_data_tasks(full_data_parameters, 'date_term_frequency')
+ visualization_type = 'date_term_frequency'
+ group = tasks.term_frequency_full_data_tasks(full_data_parameters, visualization_type)
results = group.apply().get()
log_id = 0 # fake ID
- filename = tasks.make_term_frequency_csv(results, full_data_parameters, log_id)
+ filename = tasks.make_full_data_csv(results, visualization_type, full_data_parameters, log_id)
with open(filename) as f:
reader = csv.DictReader(f)
diff --git a/backend/download/views.py b/backend/download/views.py
index 0e40bfa67..3d4a4e326 100644
--- a/backend/download/views.py
+++ b/backend/download/views.py
@@ -98,10 +98,10 @@ class FullDataDownloadTaskView(APIView):
permission_classes = [IsAuthenticated, CorpusAccessPermission]
def post(self, request, *args, **kwargs):
- check_json_keys(request, ['visualization', 'parameters', 'corpus'])
+ check_json_keys(request, ['visualization', 'parameters', 'corpus_name'])
visualization_type = request.data['visualization']
- known_visualisations = ['date_term_frequency', 'aggregate_term_frequency']
+ known_visualisations = ['date_term_frequency', 'aggregate_term_frequency', 'ngram']
if visualization_type not in known_visualisations:
raise ParseError(f'Download failed: unknown visualisation type "{visualization_type}"')
diff --git a/backend/es/download.py b/backend/es/download.py
index d4da32a63..49e2c5614 100644
--- a/backend/es/download.py
+++ b/backend/es/download.py
@@ -43,6 +43,6 @@ def normal_search(corpus, query_model, size):
result = search(
corpus = corpus,
query_model=query_model,
- size = size,
+ size=size,
)
return hits(result)
diff --git a/backend/visualization/ngram.py b/backend/visualization/ngram.py
index 3e568a612..07d54108b 100644
--- a/backend/visualization/ngram.py
+++ b/backend/visualization/ngram.py
@@ -5,6 +5,7 @@
from addcorpus.models import CorpusConfiguration
from datetime import datetime
from es.search import get_index, search
+from es.download import scroll
from ianalyzer.elasticsearch import elasticsearch
from visualization import query, termvectors
@@ -50,7 +51,7 @@ def get_total_time_interval(es_query, corpus):
def get_time_bins(es_query, corpus):
"""Wide bins for a query. Depending on the total time range of the query, time intervervals are
10 years (>100 yrs), 5 years (100-20 yrs) of 1 year (<20 yrs)."""
-
+
min_date, max_date = get_total_time_interval(es_query, corpus)
min_year, max_year = min_date.year, max_date.year
time_range = max_year - min_year
@@ -77,9 +78,9 @@ def get_time_bins(es_query, corpus):
return bins
-def tokens_by_time_interval(corpus, es_query, field, bin, ngram_size, term_position, freq_compensation, subfield, max_size_per_interval, date_field):
- index = get_index(corpus)
- client = elasticsearch(corpus)
+def tokens_by_time_interval(corpus_name, es_query, field, bin, ngram_size, term_position, freq_compensation, subfield, max_size_per_interval, date_field, **kwargs):
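+    # **kwargs absorbs request parameters that are not used by this function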
+ index = get_index(corpus_name)
+ client = elasticsearch(corpus_name)
positions_dict = {
'any': list(range(ngram_size)),
'first': [0],
@@ -100,21 +101,21 @@ def tokens_by_time_interval(corpus, es_query, field, bin, ngram_size, term_posit
date_filter = query.make_date_filter(start_date, end_date, date_field)
narrow_query = query.add_filter(es_query, date_filter)
-    #search for the query text
+    # retrieve the documents matching the query in this time bin
- search_results = search(
- corpus=corpus,
- query_model = narrow_query,
- client = client,
- size = max_size_per_interval,
+ search_results, _total = scroll(
+ corpus=corpus_name,
+ query_model=narrow_query,
+ client=client,
+ download_size=max_size_per_interval,
)
bin_ngrams = Counter()
- for hit in search_results['hits']['hits']:
+ for hit in search_results:
identifier = hit['_id']
# get the term vectors for the hit
result = client.termvectors(
index=index,
id=identifier,
term_statistics=freq_compensation,
- fields = [field]
+ fields=[field]
)
terms = termvectors.get_terms(result, field)
if terms:
diff --git a/backend/visualization/tasks.py b/backend/visualization/tasks.py
index 3b7dcf88a..51658115d 100644
--- a/backend/visualization/tasks.py
+++ b/backend/visualization/tasks.py
@@ -25,7 +25,7 @@ def ngram_data_tasks(request_json):
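+    # build the chord without applying it; callers invoke it or chain it themselves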
return chord(group([
get_ngram_data_bin.s(
- corpus=corpus,
+ corpus_name=corpus,
es_query=es_query,
field=request_json['field'],
bin=b,
@@ -40,7 +40,7 @@ def ngram_data_tasks(request_json):
]), integrate_ngram_results.s(
number_of_ngrams=request_json['number_of_ngrams']
)
- )()
+ )
@shared_task()
def get_histogram_term_frequency_bin(es_query, corpus_name, field_name, field_value, size, include_query_in_result = False):
diff --git a/backend/visualization/tests/test_ngrams.py b/backend/visualization/tests/test_ngrams.py
index 93de4bbdb..16a797d3d 100644
--- a/backend/visualization/tests/test_ngrams.py
+++ b/backend/visualization/tests/test_ngrams.py
@@ -111,10 +111,10 @@ def test_top_10_ngrams():
for w in target_data }
assert dataset_relative['data'] == relative_frequencies[word]
-def get_binned_results(corpus, query, time_bins=CENTURY_BINS, ngram_size=2, term_position='any', freq_compensation=None, subfield='none', max_size_per_interval=20, date_field='date'):
+def get_binned_results(corpus_name, query, time_bins=CENTURY_BINS, ngram_size=2, term_position='any', freq_compensation=None, subfield='none', max_size_per_interval=20, date_field='date'):
return [
ngram.tokens_by_time_interval(
- corpus, query, 'content', bin, ngram_size, term_position, freq_compensation, subfield, max_size_per_interval, date_field)
+ corpus_name, query, 'content', bin, ngram_size, term_position, freq_compensation, subfield, max_size_per_interval, date_field)
for bin in time_bins
]
diff --git a/backend/visualization/views.py b/backend/visualization/views.py
index 034a7d584..c608b784e 100644
--- a/backend/visualization/views.py
+++ b/backend/visualization/views.py
@@ -57,7 +57,7 @@ def post(self, request, *args, **kwargs):
try:
handle_tags_in_request(request)
- chord = tasks.ngram_data_tasks(request.data)
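+            # ngram_data_tasks returns an unapplied chord; calling it starts the tasks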
+ chord = tasks.ngram_data_tasks(request.data)()
subtasks = [chord, *chord.parent.children]
return Response({'task_ids': [task.id for task in subtasks]})
except Exception as e:
diff --git a/frontend/src/app/document-view/document-view.component.html b/frontend/src/app/document-view/document-view.component.html
index dde15325d..c7505664b 100644
--- a/frontend/src/app/document-view/document-view.component.html
+++ b/frontend/src/app/document-view/document-view.component.html
@@ -29,7 +29,7 @@
-
+
diff --git a/frontend/src/app/document-view/document-view.component.spec.ts b/frontend/src/app/document-view/document-view.component.spec.ts
index 863dd2631..6fa470a3a 100644
--- a/frontend/src/app/document-view/document-view.component.spec.ts
+++ b/frontend/src/app/document-view/document-view.component.spec.ts
@@ -41,4 +41,11 @@ describe('DocumentViewComponent', () => {
const element = debug[0].nativeElement;
expect(element.textContent).toBe('Hello world!');
});
+
+ it('should create tabs', () => {
+ const debug = fixture.debugElement.queryAll(By.css('a[role=tab]'));
+ expect(debug.length).toBe(2);
+ expect(debug[0].attributes['id']).toBe('tab-speech');
+ expect(debug[1].attributes['id']).toBe('tab-scan');
+ });
});
diff --git a/frontend/src/app/history/download-history/download-history.component.ts b/frontend/src/app/history/download-history/download-history.component.ts
index 24b495aab..bf926fdcb 100644
--- a/frontend/src/app/history/download-history/download-history.component.ts
+++ b/frontend/src/app/history/download-history/download-history.component.ts
@@ -39,7 +39,8 @@ export class DownloadHistoryComponent extends HistoryDirective implements OnInit
const displayNames = {
search_results: 'Search results',
date_term_frequency: 'Term frequency',
- aggregate_term_frequency: 'Term frequency'
+ aggregate_term_frequency: 'Term frequency',
+ ngram: 'Neighbouring words'
// timeline/histogram distinction is relevant for backend but not for the user
};
return displayNames[type];
diff --git a/frontend/src/app/models/search-results.ts b/frontend/src/app/models/search-results.ts
index 150df3c74..a9d5edbff 100644
--- a/frontend/src/app/models/search-results.ts
+++ b/frontend/src/app/models/search-results.ts
@@ -102,7 +102,7 @@ export type TermFrequencyDownloadParameters = DateTermFrequencyParameters[] | Ag
export type LimitedResultsDownloadParameters = ResultsDownloadParameters & { size: number } & DownloadOptions;
-export type DownloadType = 'search_results' | 'aggregate_term_frequency' | 'date_term_frequency';
+export type DownloadType = 'search_results' | 'aggregate_term_frequency' | 'date_term_frequency' | 'ngram';
export type DownloadStatus = 'done' | 'working' | 'error';
export type DownloadParameters = TermFrequencyDownloadParameters | ResultsDownloadParameters;
diff --git a/frontend/src/app/services/api.service.ts b/frontend/src/app/services/api.service.ts
index e50e73b0b..47ced1eb4 100644
--- a/frontend/src/app/services/api.service.ts
+++ b/frontend/src/app/services/api.service.ts
@@ -175,12 +175,18 @@ export class ApiService {
| {
visualization: 'date_term_frequency';
parameters: DateTermFrequencyParameters[];
- corpus: string;
+ corpus_name: string;
}
| {
visualization: 'aggregate_term_frequency';
parameters: AggregateTermFrequencyParameters[];
- corpus: string;
+ corpus_name: string;
+ }
+    | {
+ visualization: 'ngram';
+ parameters: NGramRequestParameters;
+ corpus_name: string;
}
    ): Promise<TaskResult> {
const url = this.apiRoute(this.downloadApiURL, 'full_data');
diff --git a/frontend/src/app/services/corpus.service.spec.ts b/frontend/src/app/services/corpus.service.spec.ts
index 26364246a..3d4d30dff 100644
--- a/frontend/src/app/services/corpus.service.spec.ts
+++ b/frontend/src/app/services/corpus.service.spec.ts
@@ -199,6 +199,8 @@ describe('CorpusService', () => {
expect(items.length).toBe(1);
const corpus = _.first(items);
+ expect(corpus.scan_image_type).toBe('png');
+
const fieldData = [
{
description: 'Banking concern to which the report belongs.',
@@ -275,6 +277,7 @@ describe('CorpusService', () => {
expect(result[key]).toEqual(expected[key]);
});
});
+
});
});
});
diff --git a/frontend/src/app/services/visualization.service.ts b/frontend/src/app/services/visualization.service.ts
index 5b3d4245d..945f379bf 100644
--- a/frontend/src/app/services/visualization.service.ts
+++ b/frontend/src/app/services/visualization.service.ts
@@ -4,13 +4,13 @@ import {
AggregateTermFrequencyParameters,
Corpus,
DateTermFrequencyParameters,
+ NGramRequestParameters,
NgramParameters,
QueryModel,
TaskResult,
TimeCategory,
} from '../models';
import { ApiService } from './api.service';
-import { ElasticSearchService } from './elastic-search.service';
@Injectable({
providedIn: 'root'
@@ -71,17 +71,14 @@ export class VisualizationService {
};
}
- public async dateTermFrequencySearch(
- corpus: Corpus, queryModel: QueryModel, fieldName: string, bins: {size: number; start_date: Date; end_date?: Date}[],
- unit: TimeCategory,
-    ): Promise<TaskResult> {
- const params = this.makeDateTermFrequencyParameters(corpus, queryModel, fieldName, bins, unit);
- return this.apiService.getDateTermFrequency(params);
- }
-
-    getNgramTasks(queryModel: QueryModel, corpus: Corpus, field: string, params: NgramParameters): Promise<TaskResult> {
+ public makeNgramRequestParameters(
+ corpus: Corpus,
+ queryModel: QueryModel,
+ field: string,
+ params: NgramParameters
+ ): NGramRequestParameters {
const esQuery = queryModel.toEsQuery();
- return this.apiService.ngramTasks({
+ return {
es_query: esQuery,
corpus_name: corpus.name,
field,
@@ -91,8 +88,21 @@ export class VisualizationService {
subfield: params.analysis,
max_size_per_interval: params.maxDocuments,
number_of_ngrams: params.numberOfNgrams,
- date_field: params.dateField,
- });
+ date_field: params.dateField
+ };
+ }
+
+ public async dateTermFrequencySearch(
+ corpus: Corpus, queryModel: QueryModel, fieldName: string, bins: {size: number; start_date: Date; end_date?: Date}[],
+ unit: TimeCategory,
+    ): Promise<TaskResult> {
+ const params = this.makeDateTermFrequencyParameters(corpus, queryModel, fieldName, bins, unit);
+ return this.apiService.getDateTermFrequency(params);
+ }
+
+    getNgramTasks(queryModel: QueryModel, corpus: Corpus, field: string, params: NgramParameters): Promise<TaskResult> {
+ const ngramRequestParams = this.makeNgramRequestParameters(corpus, queryModel, field, params);
+ return this.apiService.ngramTasks(ngramRequestParams);
}
diff --git a/frontend/src/app/visualization/barchart/histogram.component.ts b/frontend/src/app/visualization/barchart/histogram.component.ts
index 5570daa4f..8d5f391f9 100644
--- a/frontend/src/app/visualization/barchart/histogram.component.ts
+++ b/frontend/src/app/visualization/barchart/histogram.component.ts
@@ -87,7 +87,7 @@ export class HistogramComponent extends BarchartDirective im
return this.apiService.requestFullData({
visualization: 'aggregate_term_frequency',
parameters: paramsPerSeries,
- corpus: this.corpus.name,
+ corpus_name: this.corpus.name,
});
}
diff --git a/frontend/src/app/visualization/barchart/timeline.component.ts b/frontend/src/app/visualization/barchart/timeline.component.ts
index f8e570975..314e6296c 100644
--- a/frontend/src/app/visualization/barchart/timeline.component.ts
+++ b/frontend/src/app/visualization/barchart/timeline.component.ts
@@ -110,7 +110,7 @@ export class TimelineComponent extends BarchartDirective impl
return this.apiService.requestFullData({
visualization: 'date_term_frequency',
parameters: paramsPerSeries,
- corpus: this.corpus.name,
+ corpus_name: this.corpus.name,
});
}
diff --git a/frontend/src/app/visualization/ngram/ngram.component.html b/frontend/src/app/visualization/ngram/ngram.component.html
index 80e71570b..52fbb94ec 100644
--- a/frontend/src/app/visualization/ngram/ngram.component.html
+++ b/frontend/src/app/visualization/ngram/ngram.component.html
@@ -115,3 +115,4 @@
+
diff --git a/frontend/src/app/visualization/ngram/ngram.component.ts b/frontend/src/app/visualization/ngram/ngram.component.ts
index 5f0c432b9..479f73d0b 100644
--- a/frontend/src/app/visualization/ngram/ngram.component.ts
+++ b/frontend/src/app/visualization/ngram/ngram.component.ts
@@ -1,7 +1,7 @@
import { Component, ElementRef, EventEmitter, Input, OnChanges, Output, SimpleChanges, ViewChild } from '@angular/core';
import * as _ from 'lodash';
import { Corpus, FreqTableHeaders, QueryModel, CorpusField, NgramResults, NgramParameters } from '../../models';
-import { ApiService, ParamService, VisualizationService } from '../../services';
+import { ApiService, NotificationService, ParamService, VisualizationService } from '../../services';
import { faCheck, faTimes } from '@fortawesome/free-solid-svg-icons';
import { ParamDirective } from '../../param/param-directive';
import { ActivatedRoute, ParamMap, Params, Router } from '@angular/router';
@@ -58,9 +58,10 @@ export class NgramComponent extends ParamDirective implements OnChanges {
constructor(
private apiService: ApiService,
private visualizationService: VisualizationService,
+ private notificationService: NotificationService,
route: ActivatedRoute,
router: Router,
- paramService: ParamService
+ paramService: ParamService,
) {
super(route, router, paramService);
this.currentParameters = new NgramParameters(
@@ -251,4 +252,30 @@ export class NgramComponent extends ParamDirective implements OnChanges {
return `${value}`;
}
+
+ requestFullData() {
+ const parameters = this.visualizationService.makeNgramRequestParameters(
+ this.corpus,
+ this.queryModel,
+ this.visualizedField.name,
+ this.currentParameters
+ );
+ this.apiService.requestFullData({
+ corpus_name: this.corpus.name,
+ visualization: 'ngram',
+ parameters
+ }).then(() =>
+ this.notificationService.showMessage(
+ 'Full data requested! You will receive an email when your download is ready.',
+ 'success',
+ {
+ text: 'view downloads',
+ route: ['/download-history']
+ }
+ )
+ ).catch(error => {
+ console.error(error);
+ this.notificationService.showMessage('Could not set up data generation.', 'danger');
+ });
+ }
}