diff --git a/backend/addcorpus/es_mappings.py b/backend/addcorpus/es_mappings.py index a2f58418f..54bddbaa0 100644 --- a/backend/addcorpus/es_mappings.py +++ b/backend/addcorpus/es_mappings.py @@ -1,4 +1,4 @@ -def main_content_mapping(token_counts=True, stopword_analysis=False, stemming_analysis=False, updated_highlighting=False): +def main_content_mapping(token_counts=True, stopword_analysis=False, stemming_analysis=False, updated_highlighting=True): ''' Mapping for the main content field. Options: diff --git a/backend/corpora/dutchannualreports/dutchannualreports.py b/backend/corpora/dutchannualreports/dutchannualreports.py index d4f4c7038..6a7c89168 100644 --- a/backend/corpora/dutchannualreports/dutchannualreports.py +++ b/backend/corpora/dutchannualreports/dutchannualreports.py @@ -12,8 +12,8 @@ from addcorpus.corpus import XMLCorpusDefinition, FieldDefinition from media.image_processing import get_pdf_info, retrieve_pdf, pdf_pages, build_partial_pdf from addcorpus.load_corpus import corpus_dir - from addcorpus.es_mappings import keyword_mapping, main_content_mapping +from addcorpus.es_settings import es_settings from media.media_url import media_url @@ -48,6 +48,10 @@ class DutchAnnualReports(XMLCorpusDefinition): dutchannualreports_map = {} + @property + def es_settings(self): + return es_settings(self.languages[0], stopword_analyzer=True, stemming_analyzer=True) + with open(op.join(corpus_dir('dutchannualreports'), 'dutchannualreports_mapping.csv')) as f: reader = csv.DictReader(f) for line in reader: diff --git a/backend/corpora/ecco/ecco.py b/backend/corpora/ecco/ecco.py index d23ef196b..a6b517dac 100644 --- a/backend/corpora/ecco/ecco.py +++ b/backend/corpora/ecco/ecco.py @@ -29,11 +29,6 @@ class Ecco(XMLCorpusDefinition): description_page = 'ecco.md' min_date = datetime(year=1700, month=1, day=1) max_date = datetime(year=1800, month=12, day=31) - - @property - def es_settings(self): - return es_settings(self.languages[0], stopword_analyzer=True, stemming_analyzer=True) - data_directory = settings.ECCO_DATA es_index = getattr(settings, 'ECCO_ES_INDEX', 'ecco') image = 'ecco.jpg' @@ -47,6 +42,10 @@ def es_settings(self): meta_pattern = re.compile('^\d+\_DocMetadata\.xml$') + @property + def es_settings(self): + return es_settings(self.languages[0], stopword_analyzer=True, stemming_analyzer=True) + def sources(self, start=min_date, end=max_date): logging.basicConfig(filename='ecco.log', level=logging.INFO) diff --git a/backend/corpora/parliament/finland-old.py b/backend/corpora/parliament/finland-old.py index 59bf354c6..7e654b941 100644 --- a/backend/corpora/parliament/finland-old.py +++ b/backend/corpora/parliament/finland-old.py @@ -14,7 +14,7 @@ class ParliamentFinlandOld(Parliament, CSVCorpusDefinition): title = 'People and Parliament (Finland, 1863-1905)' description = 'Speeches from the early Finnish estates' - max_date = datetime(year=1905, month=12, day=31) + max_date = datetime(year=1906, month=12, day=31) min_date = datetime(year=1863, month=1, day=1) data_directory = settings.PP_FINLAND_OLD_DATA es_index = getattr(settings, 'PP_FINLAND_OLD_INDEX', 'parliament-finland-old') diff --git a/backend/corpora/parliament/netherlands.py b/backend/corpora/parliament/netherlands.py index 49f4ba5a1..7e3f51e6b 100644 --- a/backend/corpora/parliament/netherlands.py +++ b/backend/corpora/parliament/netherlands.py @@ -124,7 +124,7 @@ class ParliamentNetherlands(Parliament, XMLCorpusDefinition): title = "People & Parliament (Netherlands)" description = "Speeches from the Eerste Kamer and 
Tweede Kamer" min_date = datetime(year=1815, month=1, day=1) - max_date = datetime(year=2020, month=12, day=31) + max_date = datetime(year=2022, month=12, day=31) data_directory = settings.PP_NL_DATA data_directory_recent = settings.PP_NL_RECENT_DATA word_model_path = getattr(settings, 'PP_NL_WM', None) diff --git a/backend/download/create_csv.py b/backend/download/create_csv.py index f0f93b8e9..b807f7f10 100644 --- a/backend/download/create_csv.py +++ b/backend/download/create_csv.py @@ -5,6 +5,7 @@ from django.conf import settings +from visualization.query import get_query_text from visualization.term_frequency import parse_datestring def write_file(filename, fieldnames, rows, dialect = 'excel'): @@ -99,4 +100,22 @@ def format_field_value(value, unit): 'week': '%Y-%m-%d', 'day': '%Y-%m-%d' } - return date.strftime(formats[unit]) + return date.strftime(formats[unit]) + +def ngram_csv(results, log_id): + rows = ngram_table(results) + fieldnames = ['date', 'N-gram', 'Frequency'] + filename = create_filename(log_id) + filepath = write_file(filename, fieldnames, rows) + return filepath + +def ngram_table(results): + rows = [] + for index, time_point in enumerate(results['time_points']): + for ngram in results['words']: + rows.append({ + 'date': time_point, + 'N-gram': ngram['label'], + 'Frequency': ngram['data'][index] + }) + return rows diff --git a/backend/download/migrations/0002_alter_download_download_type.py b/backend/download/migrations/0002_alter_download_download_type.py new file mode 100644 index 000000000..dd44e9d2f --- /dev/null +++ b/backend/download/migrations/0002_alter_download_download_type.py @@ -0,0 +1,18 @@ +# Generated by Django 4.1.10 on 2023-10-18 12:52 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('download', '0001_initial'), + ] + + operations = [ + migrations.AlterField( + model_name='download', + name='download_type', + field=models.CharField(choices=[('search_results', 'Search results'), ('date_term_frequency', 'Term frequency (timeline)'), ('aggregate_term_frequency', 'Term frequency (histogram)'), ('ngram', 'Neighbouring words')], help_text='Type of download (search results or a type of visualisation)', max_length=126), + ), + ] diff --git a/backend/download/models.py b/backend/download/models.py index 2e5e9bcd1..3afe3074f 100644 --- a/backend/download/models.py +++ b/backend/download/models.py @@ -1,8 +1,9 @@ from django.db import models +from django.conf import settings +from django.utils import timezone + from users.models import CustomUser from addcorpus.models import Corpus -from django.conf import settings -from datetime import datetime MAX_LENGTH_FILENAME = 254 @@ -17,6 +18,7 @@ class Download(models.Model): ('search_results', 'Search results'), ('date_term_frequency', 'Term frequency (timeline)'), ('aggregate_term_frequency', 'Term frequency (histogram)'), + ('ngram', 'Neighbouring words') ], help_text='Type of download (search results or a type of visualisation)') corpus = models.ForeignKey(Corpus, on_delete=models.CASCADE, to_field='name', related_name='downloads') @@ -49,7 +51,7 @@ def complete(self, filename = None): ''' self.filename = filename - self.completed = datetime.now() + self.completed = timezone.now() self.save() def descriptive_filename(self): diff --git a/backend/download/tasks.py b/backend/download/tasks.py index a47340ba6..ecc9b5f0c 100644 --- a/backend/download/tasks.py +++ b/backend/download/tasks.py @@ -2,13 +2,12 @@ import re from django.conf import settings from 
celery import shared_task, chain, group -from django.urls import reverse from es import download as es_download from download import create_csv from download.models import Download from addcorpus.models import Corpus -from visualization.tasks import histogram_term_frequency_tasks, timeline_term_frequency_tasks +from visualization.tasks import histogram_term_frequency_tasks, timeline_term_frequency_tasks, ngram_data_tasks from visualization import query from download.mail import send_csv_email @@ -90,10 +89,12 @@ def download_search_results(request_json, user): return try_download(make_chain, download) @shared_task() -def make_term_frequency_csv(results_per_series, parameters_per_series, log_id): +def make_full_data_csv(results_per_series, visualization_type, parameters_per_series, log_id): ''' Export term frequency results to a csv. ''' + if visualization_type == 'ngram': + return create_csv.ngram_csv(results_per_series, log_id) query_per_series, field_name, unit = extract_term_frequency_download_metadata(parameters_per_series) return create_csv.term_frequency_csv(query_per_series, results_per_series, field_name, log_id, unit = unit) @@ -110,6 +111,10 @@ def term_frequency_full_data_tasks(parameters_per_series, visualization_type): task_function(series_parameters, True) for series_parameters in parameters_unlimited ) +def ngram_full_data_tasks(ngram_parameters, dummy): + ngram_parameters['max_size_per_interval'] = None + return ngram_data_tasks(ngram_parameters) + def extract_term_frequency_download_metadata(parameters_per_series): ''' Get some relevant metadata for a term frequency request: @@ -148,16 +153,16 @@ def download_full_data(request_json, user): ''' Download the full data for a visualisation ''' - visualization_type = request_json['visualization'] task_per_type = { 'date_term_frequency': term_frequency_full_data_tasks, - 'aggregate_term_frequency': term_frequency_full_data_tasks + 'aggregate_term_frequency': term_frequency_full_data_tasks, + 'ngram': ngram_full_data_tasks, } parameters = request_json['parameters'] - corpus_name = request_json['corpus'] + corpus_name = request_json['corpus_name'] corpus = Corpus.objects.get(name=corpus_name) task = task_per_type[visualization_type](parameters, visualization_type) @@ -166,7 +171,7 @@ def download_full_data(request_json, user): make_chain = lambda : chain( task, - make_term_frequency_csv.s(parameters, download.id), + make_full_data_csv.s(visualization_type, parameters, download.id), complete_download.s(download.id), csv_data_email.s(user.email, user.username), ).on_error(complete_failed_download.s(download.id)) diff --git a/backend/download/tests/test_csv_results.py b/backend/download/tests/test_csv_results.py index f33a896ef..6abc899e7 100644 --- a/backend/download/tests/test_csv_results.py +++ b/backend/download/tests/test_csv_results.py @@ -208,3 +208,26 @@ def test_date_format(): for value, unit, expected in cases: assert create_csv.format_field_value(value, unit) == expected + + +mock_ngram_data = { + 'words': [ + {'label': 'ex parrot', 'data': [2, 3]}, + {'label': 'this parrot what', 'data': [4, 8]}, + {'label': 'dead parrot when', 'data': [4, 6]}, + ], + 'time_points': ['1960-1965', '1962-1967'] +} + +expected_csv_table = [ + {'date': '1960-1965', 'N-gram': 'ex parrot', 'Frequency': 2}, + {'date': '1960-1965', 'N-gram': 'this parrot what', 'Frequency': 4}, + {'date': '1960-1965', 'N-gram': 'dead parrot when', 'Frequency': 4}, + {'date': '1962-1967', 'N-gram': 'ex parrot', 'Frequency': 3}, + {'date': '1962-1967', 'N-gram': 
'this parrot what', 'Frequency': 8}, + {'date': '1962-1967', 'N-gram': 'dead parrot when', 'Frequency': 6}, +] + +def test_ngram_table(): + table = create_csv.ngram_table(mock_ngram_data) + assert table == expected_csv_table \ No newline at end of file diff --git a/backend/download/tests/test_download_views.py b/backend/download/tests/test_download_views.py index 6d9993815..42607a238 100644 --- a/backend/download/tests/test_download_views.py +++ b/backend/download/tests/test_download_views.py @@ -6,7 +6,7 @@ from download import SEARCH_RESULTS_DIALECT from addcorpus.models import Corpus import io -from visualization.query import MATCH_ALL +from visualization import query from es.search import hits from tag.models import Tag, TaggedDocument @@ -48,21 +48,7 @@ def term_frequency_parameters(mock_corpus, mock_corpus_specs): # TODO: construct query from query module, which is much more convenient query_text = mock_corpus_specs['example_query'] search_field = mock_corpus_specs['content_field'] - query = { - "query": { - "bool": { - "must": { - "simple_query_string": { - "query": query_text, - "fields": [search_field], - "lenient": True, - "default_operator": "or" - } - }, - "filter": [] - } - } - } + query = mock_es_query(query_text, search_field) return { 'es_query': query, 'corpus_name': mock_corpus, @@ -78,14 +64,40 @@ def term_frequency_parameters(mock_corpus, mock_corpus_specs): 'unit': 'year', } +def ngram_parameters(mock_corpus, mock_corpus_specs): + query_text = mock_corpus_specs['example_query'] + search_field = mock_corpus_specs['content_field'] + return { + 'corpus_name': mock_corpus, + 'es_query': mock_es_query(query_text, search_field), + 'field': search_field, + 'ngram_size': 2, + 'term_position': 'any', + 'freq_compensation': True, + 'subfield': 'clean', + 'max_size_per_interval': 50, + 'number_of_ngrams': 10, + 'date_field': 'date' + } + +def mock_es_query(query_text, search_field): + q = query.MATCH_ALL + q = query.set_query_text(q, query_text) + q = query.set_search_fields(q, [search_field]) + return q + +@pytest.mark.parametrize("visualization_type, request_parameters", [('date_term_frequency', term_frequency_parameters), ('ngram', ngram_parameters)]) def test_full_data_download_view(transactional_db, admin_client, small_mock_corpus, index_small_mock_corpus, small_mock_corpus_specs, celery_worker, - csv_directory): - parameters = term_frequency_parameters(small_mock_corpus, small_mock_corpus_specs) + csv_directory, visualization_type, request_parameters): + parameters = request_parameters(small_mock_corpus, small_mock_corpus_specs) + if visualization_type != 'ngram': + # timeline and histogram expect a series of parameters + parameters = [parameters] request_json = { - 'visualization': 'date_term_frequency', - 'parameters': [parameters], - 'corpus': small_mock_corpus + 'visualization': visualization_type, + 'parameters': parameters, + 'corpus_name': small_mock_corpus } response = admin_client.post( '/api/download/full_data', @@ -160,7 +172,7 @@ def test_csv_download_view(admin_client, finished_download): def some_document_id(admin_client, small_mock_corpus, index_small_mock_corpus): search_response = admin_client.post( f'/api/es/{small_mock_corpus}/_search', - {'es_query': MATCH_ALL}, + {'es_query': query.MATCH_ALL}, content_type='application/json' ) @@ -188,7 +200,7 @@ def test_download_with_tag(db, admin_client, small_mock_corpus, index_small_mock encoding = 'utf-8' download_request_json = { 'corpus': small_mock_corpus, - 'es_query': MATCH_ALL, + 'es_query': query.MATCH_ALL, 
'tags': [tag_on_some_document.id], 'fields': ['date','content'], 'size': 3, diff --git a/backend/download/tests/test_full_data.py b/backend/download/tests/test_full_data.py index 47e553310..b3334ee39 100644 --- a/backend/download/tests/test_full_data.py +++ b/backend/download/tests/test_full_data.py @@ -21,11 +21,11 @@ def test_timeline_full_data(small_mock_corpus, index_small_mock_corpus, small_mo ], 'unit': 'year' }] - - group = tasks.term_frequency_full_data_tasks(full_data_parameters, 'date_term_frequency') + visualization_type = 'date_term_frequency' + group = tasks.term_frequency_full_data_tasks(full_data_parameters, visualization_type) results = group.apply().get() log_id = 0 # fake ID - filename = tasks.make_term_frequency_csv(results, full_data_parameters, log_id) + filename = tasks.make_full_data_csv(results, visualization_type, full_data_parameters, log_id) with open(filename) as f: reader = csv.DictReader(f) diff --git a/backend/download/views.py b/backend/download/views.py index 0e40bfa67..3d4a4e326 100644 --- a/backend/download/views.py +++ b/backend/download/views.py @@ -98,10 +98,10 @@ class FullDataDownloadTaskView(APIView): permission_classes = [IsAuthenticated, CorpusAccessPermission] def post(self, request, *args, **kwargs): - check_json_keys(request, ['visualization', 'parameters', 'corpus']) + check_json_keys(request, ['visualization', 'parameters', 'corpus_name']) visualization_type = request.data['visualization'] - known_visualisations = ['date_term_frequency', 'aggregate_term_frequency'] + known_visualisations = ['date_term_frequency', 'aggregate_term_frequency', 'ngram'] if visualization_type not in known_visualisations: raise ParseError(f'Download failed: unknown visualisation type "{visualization_type}"') diff --git a/backend/es/download.py b/backend/es/download.py index d4da32a63..49e2c5614 100644 --- a/backend/es/download.py +++ b/backend/es/download.py @@ -43,6 +43,6 @@ def normal_search(corpus, query_model, size): result = search( corpus = corpus, query_model=query_model, - size = size, + size=size, ) return hits(result) diff --git a/backend/visualization/ngram.py b/backend/visualization/ngram.py index 3e568a612..07d54108b 100644 --- a/backend/visualization/ngram.py +++ b/backend/visualization/ngram.py @@ -5,6 +5,7 @@ from addcorpus.models import CorpusConfiguration from datetime import datetime from es.search import get_index, search +from es.download import scroll from ianalyzer.elasticsearch import elasticsearch from visualization import query, termvectors @@ -50,7 +51,7 @@ def get_total_time_interval(es_query, corpus): def get_time_bins(es_query, corpus): """Wide bins for a query. 
Depending on the total time range of the query, time intervals are 10 years (>100 yrs), 5 years (100-20 yrs) or 1 year (<20 yrs).""" - + min_date, max_date = get_total_time_interval(es_query, corpus) min_year, max_year = min_date.year, max_date.year time_range = max_year - min_year @@ -77,9 +78,9 @@ return bins -def tokens_by_time_interval(corpus, es_query, field, bin, ngram_size, term_position, freq_compensation, subfield, max_size_per_interval, date_field): - index = get_index(corpus) - client = elasticsearch(corpus) +def tokens_by_time_interval(corpus_name, es_query, field, bin, ngram_size, term_position, freq_compensation, subfield, max_size_per_interval, date_field, **kwargs): + index = get_index(corpus_name) + client = elasticsearch(corpus_name) positions_dict = { 'any': list(range(ngram_size)), 'first': [0], @@ -100,21 +101,21 @@ date_filter = query.make_date_filter(start_date, end_date, date_field) narrow_query = query.add_filter(es_query, date_filter) #search for the query text - search_results = search( - corpus=corpus, - query_model = narrow_query, - client = client, - size = max_size_per_interval, + search_results, _total = scroll( + corpus=corpus_name, + query_model=narrow_query, + client=client, + download_size=max_size_per_interval, ) bin_ngrams = Counter() - for hit in search_results['hits']['hits']: + for hit in search_results: identifier = hit['_id'] # get the term vectors for the hit result = client.termvectors( index=index, id=identifier, term_statistics=freq_compensation, - fields = [field] + fields=[field] ) terms = termvectors.get_terms(result, field) if terms: diff --git a/backend/visualization/tasks.py b/backend/visualization/tasks.py index 3b7dcf88a..51658115d 100644 --- a/backend/visualization/tasks.py +++ b/backend/visualization/tasks.py @@ -25,7 +25,7 @@ def ngram_data_tasks(request_json): return chord(group([ get_ngram_data_bin.s( - corpus=corpus, + corpus_name=corpus, es_query=es_query, field=request_json['field'], bin=b, @@ -40,7 +40,7 @@ ]), integrate_ngram_results.s( number_of_ngrams=request_json['number_of_ngrams'] ) - )() + ) @shared_task() def get_histogram_term_frequency_bin(es_query, corpus_name, field_name, field_value, size, include_query_in_result = False): diff --git a/backend/visualization/tests/test_ngrams.py b/backend/visualization/tests/test_ngrams.py index 93de4bbdb..16a797d3d 100644 --- a/backend/visualization/tests/test_ngrams.py +++ b/backend/visualization/tests/test_ngrams.py @@ -111,10 +111,10 @@ def test_top_10_ngrams(): for w in target_data } assert dataset_relative['data'] == relative_frequencies[word] -def get_binned_results(corpus, query, time_bins=CENTURY_BINS, ngram_size=2, term_position='any', freq_compensation=None, subfield='none', max_size_per_interval=20, date_field='date'): +def get_binned_results(corpus_name, query, time_bins=CENTURY_BINS, ngram_size=2, term_position='any', freq_compensation=None, subfield='none', max_size_per_interval=20, date_field='date'): return [ ngram.tokens_by_time_interval( - corpus_name, query, 'content', bin, ngram_size, term_position, freq_compensation, subfield, max_size_per_interval, date_field) + corpus_name, query, 'content', bin, ngram_size, term_position, freq_compensation, subfield, max_size_per_interval, date_field) for bin in time_bins ] diff --git a/backend/visualization/views.py b/backend/visualization/views.py index
034a7d584..c608b784e 100644 --- a/backend/visualization/views.py +++ b/backend/visualization/views.py @@ -57,7 +57,7 @@ def post(self, request, *args, **kwargs): try: handle_tags_in_request(request) - chord = tasks.ngram_data_tasks(request.data) + chord = tasks.ngram_data_tasks(request.data)() subtasks = [chord, *chord.parent.children] return Response({'task_ids': [task.id for task in subtasks]}) except Exception as e: diff --git a/frontend/src/app/document-view/document-view.component.html b/frontend/src/app/document-view/document-view.component.html index dde15325d..c7505664b 100644 --- a/frontend/src/app/document-view/document-view.component.html +++ b/frontend/src/app/document-view/document-view.component.html @@ -29,7 +29,7 @@
- + diff --git a/frontend/src/app/document-view/document-view.component.spec.ts b/frontend/src/app/document-view/document-view.component.spec.ts index 863dd2631..6fa470a3a 100644 --- a/frontend/src/app/document-view/document-view.component.spec.ts +++ b/frontend/src/app/document-view/document-view.component.spec.ts @@ -41,4 +41,11 @@ describe('DocumentViewComponent', () => { const element = debug[0].nativeElement; expect(element.textContent).toBe('Hello world!'); }); + + it('should create tabs', () => { + const debug = fixture.debugElement.queryAll(By.css('a[role=tab]')); + expect(debug.length).toBe(2); + expect(debug[0].attributes['id']).toBe('tab-speech'); + expect(debug[1].attributes['id']).toBe('tab-scan'); + }); }); diff --git a/frontend/src/app/history/download-history/download-history.component.ts b/frontend/src/app/history/download-history/download-history.component.ts index 24b495aab..bf926fdcb 100644 --- a/frontend/src/app/history/download-history/download-history.component.ts +++ b/frontend/src/app/history/download-history/download-history.component.ts @@ -39,7 +39,8 @@ export class DownloadHistoryComponent extends HistoryDirective implements OnInit const displayNames = { search_results: 'Search results', date_term_frequency: 'Term frequency', - aggregate_term_frequency: 'Term frequency' + aggregate_term_frequency: 'Term frequency', + ngram: 'Neighbouring words' // timeline/histogram distinction is relevant for backend but not for the user }; return displayNames[type]; diff --git a/frontend/src/app/models/search-results.ts b/frontend/src/app/models/search-results.ts index 150df3c74..a9d5edbff 100644 --- a/frontend/src/app/models/search-results.ts +++ b/frontend/src/app/models/search-results.ts @@ -102,7 +102,7 @@ export type TermFrequencyDownloadParameters = DateTermFrequencyParameters[] | Ag export type LimitedResultsDownloadParameters = ResultsDownloadParameters & { size: number } & DownloadOptions; -export type DownloadType = 'search_results' | 'aggregate_term_frequency' | 'date_term_frequency'; +export type DownloadType = 'search_results' | 'aggregate_term_frequency' | 'date_term_frequency' | 'ngram'; export type DownloadStatus = 'done' | 'working' | 'error'; export type DownloadParameters = TermFrequencyDownloadParameters | ResultsDownloadParameters; diff --git a/frontend/src/app/services/api.service.ts b/frontend/src/app/services/api.service.ts index e50e73b0b..47ced1eb4 100644 --- a/frontend/src/app/services/api.service.ts +++ b/frontend/src/app/services/api.service.ts @@ -175,12 +175,18 @@ export class ApiService { | { visualization: 'date_term_frequency'; parameters: DateTermFrequencyParameters[]; - corpus: string; + corpus_name: string; } | { visualization: 'aggregate_term_frequency'; parameters: AggregateTermFrequencyParameters[]; - corpus: string; + corpus_name: string; + } + | + { + visualization: 'ngram'; + parameters: NGramRequestParameters; + corpus_name: string; } ): Promise { const url = this.apiRoute(this.downloadApiURL, 'full_data'); diff --git a/frontend/src/app/services/corpus.service.spec.ts b/frontend/src/app/services/corpus.service.spec.ts index 26364246a..3d4d30dff 100644 --- a/frontend/src/app/services/corpus.service.spec.ts +++ b/frontend/src/app/services/corpus.service.spec.ts @@ -199,6 +199,8 @@ describe('CorpusService', () => { expect(items.length).toBe(1); const corpus = _.first(items); + expect(corpus.scan_image_type).toBe('png'); + const fieldData = [ { description: 'Banking concern to which the report belongs.', @@ -275,6 +277,7 @@ 
describe('CorpusService', () => { expect(result[key]).toEqual(expected[key]); }); }); + }); }); }); diff --git a/frontend/src/app/services/visualization.service.ts b/frontend/src/app/services/visualization.service.ts index 5b3d4245d..945f379bf 100644 --- a/frontend/src/app/services/visualization.service.ts +++ b/frontend/src/app/services/visualization.service.ts @@ -4,13 +4,13 @@ import { AggregateTermFrequencyParameters, Corpus, DateTermFrequencyParameters, + NGramRequestParameters, NgramParameters, QueryModel, TaskResult, TimeCategory, } from '../models'; import { ApiService } from './api.service'; -import { ElasticSearchService } from './elastic-search.service'; @Injectable({ providedIn: 'root' @@ -71,17 +71,14 @@ export class VisualizationService { }; } - public async dateTermFrequencySearch( - corpus: Corpus, queryModel: QueryModel, fieldName: string, bins: {size: number; start_date: Date; end_date?: Date}[], - unit: TimeCategory, - ): Promise { - const params = this.makeDateTermFrequencyParameters(corpus, queryModel, fieldName, bins, unit); - return this.apiService.getDateTermFrequency(params); - } - - getNgramTasks(queryModel: QueryModel, corpus: Corpus, field: string, params: NgramParameters): Promise { + public makeNgramRequestParameters( + corpus: Corpus, + queryModel: QueryModel, + field: string, + params: NgramParameters + ): NGramRequestParameters { const esQuery = queryModel.toEsQuery(); - return this.apiService.ngramTasks({ + return { es_query: esQuery, corpus_name: corpus.name, field, @@ -91,8 +88,21 @@ export class VisualizationService { subfield: params.analysis, max_size_per_interval: params.maxDocuments, number_of_ngrams: params.numberOfNgrams, - date_field: params.dateField, - }); + date_field: params.dateField + }; + } + + public async dateTermFrequencySearch( + corpus: Corpus, queryModel: QueryModel, fieldName: string, bins: {size: number; start_date: Date; end_date?: Date}[], + unit: TimeCategory, + ): Promise { + const params = this.makeDateTermFrequencyParameters(corpus, queryModel, fieldName, bins, unit); + return this.apiService.getDateTermFrequency(params); + } + + getNgramTasks(queryModel: QueryModel, corpus: Corpus, field: string, params: NgramParameters): Promise { + const ngramRequestParams = this.makeNgramRequestParameters(corpus, queryModel, field, params); + return this.apiService.ngramTasks(ngramRequestParams); } diff --git a/frontend/src/app/visualization/barchart/histogram.component.ts b/frontend/src/app/visualization/barchart/histogram.component.ts index 5570daa4f..8d5f391f9 100644 --- a/frontend/src/app/visualization/barchart/histogram.component.ts +++ b/frontend/src/app/visualization/barchart/histogram.component.ts @@ -87,7 +87,7 @@ export class HistogramComponent extends BarchartDirective im return this.apiService.requestFullData({ visualization: 'aggregate_term_frequency', parameters: paramsPerSeries, - corpus: this.corpus.name, + corpus_name: this.corpus.name, }); } diff --git a/frontend/src/app/visualization/barchart/timeline.component.ts b/frontend/src/app/visualization/barchart/timeline.component.ts index f8e570975..314e6296c 100644 --- a/frontend/src/app/visualization/barchart/timeline.component.ts +++ b/frontend/src/app/visualization/barchart/timeline.component.ts @@ -110,7 +110,7 @@ export class TimelineComponent extends BarchartDirective impl return this.apiService.requestFullData({ visualization: 'date_term_frequency', parameters: paramsPerSeries, - corpus: this.corpus.name, + corpus_name: this.corpus.name, }); } diff --git 
a/frontend/src/app/visualization/ngram/ngram.component.html b/frontend/src/app/visualization/ngram/ngram.component.html index 80e71570b..52fbb94ec 100644 --- a/frontend/src/app/visualization/ngram/ngram.component.html +++ b/frontend/src/app/visualization/ngram/ngram.component.html @@ -115,3 +115,4 @@ + diff --git a/frontend/src/app/visualization/ngram/ngram.component.ts b/frontend/src/app/visualization/ngram/ngram.component.ts index 5f0c432b9..479f73d0b 100644 --- a/frontend/src/app/visualization/ngram/ngram.component.ts +++ b/frontend/src/app/visualization/ngram/ngram.component.ts @@ -1,7 +1,7 @@ import { Component, ElementRef, EventEmitter, Input, OnChanges, Output, SimpleChanges, ViewChild } from '@angular/core'; import * as _ from 'lodash'; import { Corpus, FreqTableHeaders, QueryModel, CorpusField, NgramResults, NgramParameters } from '../../models'; -import { ApiService, ParamService, VisualizationService } from '../../services'; +import { ApiService, NotificationService, ParamService, VisualizationService } from '../../services'; import { faCheck, faTimes } from '@fortawesome/free-solid-svg-icons'; import { ParamDirective } from '../../param/param-directive'; import { ActivatedRoute, ParamMap, Params, Router } from '@angular/router'; @@ -58,9 +58,10 @@ export class NgramComponent extends ParamDirective implements OnChanges { constructor( private apiService: ApiService, private visualizationService: VisualizationService, + private notificationService: NotificationService, route: ActivatedRoute, router: Router, - paramService: ParamService + paramService: ParamService, ) { super(route, router, paramService); this.currentParameters = new NgramParameters( @@ -251,4 +252,30 @@ export class NgramComponent extends ParamDirective implements OnChanges { return `${value}`; } + + requestFullData() { + const parameters = this.visualizationService.makeNgramRequestParameters( + this.corpus, + this.queryModel, + this.visualizedField.name, + this.currentParameters + ); + this.apiService.requestFullData({ + corpus_name: this.corpus.name, + visualization: 'ngram', + parameters + }).then(() => + this.notificationService.showMessage( + 'Full data requested! You will receive an email when your download is ready.', + 'success', + { + text: 'view downloads', + route: ['/download-history'] + } + ) + ).catch(error => { + console.error(error); + this.notificationService.showMessage('Could not set up data generation.', 'danger'); + }); + } }
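Reviewer summary: the backend changes above wire the new 'ngram' download type end to end. FullDataDownloadTaskView accepts it, download_full_data dispatches to ngram_full_data_tasks (which clears max_size_per_interval so the scroll in tokens_by_time_interval fetches every hit per bin), and make_full_data_csv routes the chord's results to create_csv.ngram_csv. Below is a minimal sketch of the request body the endpoint now expects; the key names follow check_json_keys in download/views.py and ngram_parameters in test_download_views.py, while the corpus, field, and query values are illustrative placeholders, not values from this PR.

import json

# Hypothetical POST body for /api/download/full_data with the new 'ngram'
# visualization type. Unlike the term frequency downloads, 'parameters' is a
# single dict rather than a list of series.
request_json = {
    'visualization': 'ngram',
    'corpus_name': 'example-corpus',    # key renamed from 'corpus' in this PR
    'parameters': {
        'corpus_name': 'example-corpus',
        'es_query': {},                 # build with the visualization.query helpers
        'field': 'content',
        'ngram_size': 2,
        'term_position': 'any',
        'freq_compensation': True,
        'subfield': 'clean',
        'max_size_per_interval': 50,    # reset to None by ngram_full_data_tasks
        'number_of_ngrams': 10,
        'date_field': 'date',
    },
}
print(json.dumps(request_json, indent=2))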
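A self-contained sketch of the pivot that create_csv.ngram_table performs, reusing the parrot fixture from test_csv_results.py: the results dict holds one frequency series per n-gram, and the table gets one row per (time bin, n-gram) pair, indexing into that bin's position in each series. This restates the loop in create_csv.py as a comprehension for illustration; it is not the code the PR ships.

# Fixture copied from mock_ngram_data in test_csv_results.py.
results = {
    'words': [
        {'label': 'ex parrot', 'data': [2, 3]},
        {'label': 'this parrot what', 'data': [4, 8]},
        {'label': 'dead parrot when', 'data': [4, 6]},
    ],
    'time_points': ['1960-1965', '1962-1967'],
}

# One row per (time bin, n-gram) pair; bins vary slowest, matching ngram_table.
rows = [
    {'date': time_point, 'N-gram': ngram['label'], 'Frequency': ngram['data'][index]}
    for index, time_point in enumerate(results['time_points'])
    for ngram in results['words']
]

assert len(rows) == len(results['time_points']) * len(results['words'])
assert rows[0] == {'date': '1960-1965', 'N-gram': 'ex parrot', 'Frequency': 2}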