Feature/full ngram #1289

Merged
merged 19 commits into from Oct 26, 2023
Changes from 12 commits
19 changes: 18 additions & 1 deletion backend/download/create_csv.py
@@ -108,4 +108,21 @@ def format_field_value(value, unit):
'week': '%Y-%m-%d',
'day': '%Y-%m-%d'
}
return date.strftime(formats[unit])

def ngram_csv(results, filename):
rows = ngram_table(results)
fieldnames = ['date', 'N-gram', 'Frequency']
filepath = write_file(filename, fieldnames, rows)
return filepath

def ngram_table(results):
rows = []
for index, time_point in enumerate(results['time_points']):
for ngram in results['words']:
rows.append({
'date': time_point,
'N-gram': ngram['label'],
'Frequency': ngram['data'][index]
})
return rows
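The new ngram_table helper flattens the chart-style results (one data series per ngram, one entry per time point) into long-format CSV rows; a quick usage sketch:

results = {
    'words': [{'label': 'dead parrot', 'data': [4, 6]}],
    'time_points': ['1960-1965', '1962-1967'],
}
ngram_table(results)
# [{'date': '1960-1965', 'N-gram': 'dead parrot', 'Frequency': 4},
#  {'date': '1962-1967', 'N-gram': 'dead parrot', 'Frequency': 6}]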
18 changes: 18 additions & 0 deletions backend/download/migrations/0002_alter_download_download_type.py
@@ -0,0 +1,18 @@
# Generated by Django 4.1.10 on 2023-10-18 12:52

from django.db import migrations, models


class Migration(migrations.Migration):

dependencies = [
('download', '0001_initial'),
]

operations = [
migrations.AlterField(
model_name='download',
name='download_type',
field=models.CharField(choices=[('search_results', 'Search results'), ('date_term_frequency', 'Term frequency (timeline)'), ('aggregate_term_frequency', 'Term frequency (histogram)'), ('ngram', 'Neighbouring words')], help_text='Type of download (search results or a type of visualisation)', max_length=126),
),
]
8 changes: 5 additions & 3 deletions backend/download/models.py
@@ -1,8 +1,9 @@
from django.db import models
from django.conf import settings
from django.utils import timezone

from users.models import CustomUser
from addcorpus.models import Corpus
from django.conf import settings
from datetime import datetime

MAX_LENGTH_FILENAME = 254

@@ -17,6 +18,7 @@ class Download(models.Model):
('search_results', 'Search results'),
('date_term_frequency', 'Term frequency (timeline)'),
('aggregate_term_frequency', 'Term frequency (histogram)'),
('ngram', 'Neighbouring words')
],
help_text='Type of download (search results or a type of visualisation)')
corpus = models.ForeignKey(Corpus, on_delete=models.CASCADE, to_field='name', related_name='downloads')
@@ -49,5 +51,5 @@ def complete(self, filename = None):
'''

self.filename = filename
self.completed = datetime.now()
self.completed = timezone.now()
Contributor Author commented:
datetime.now() generates runtime errors, as it's not timezone aware.
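A minimal sketch of the difference (assuming Django's USE_TZ = True):

from datetime import datetime
from django.utils import timezone

naive = datetime.now()   # tzinfo is None; Django warns when storing this in a DateTimeField
aware = timezone.now()   # timezone-aware (UTC when USE_TZ = True)

assert naive.tzinfo is None
assert aware.tzinfo is not None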

self.save()
18 changes: 12 additions & 6 deletions backend/download/tasks.py
@@ -8,7 +8,7 @@
from download import create_csv
from download.models import Download
from addcorpus.models import Corpus
from visualization.tasks import histogram_term_frequency_tasks, timeline_term_frequency_tasks
from visualization.tasks import histogram_term_frequency_tasks, timeline_term_frequency_tasks, ngram_data_tasks
from visualization import query
from download.mail import send_csv_email

@@ -90,10 +90,12 @@ def download_search_results(request_json, user):
return try_download(make_chain, download)

@shared_task()
def make_term_frequency_csv(results_per_series, parameters_per_series):
def make_full_data_csv(results_per_series, visualization_type, parameters_per_series):
'''
Export visualisation results (term frequency or ngram) to a csv.
'''
if visualization_type == 'ngram':
return create_csv.ngram_csv(results_per_series)
query_per_series, field_name, unit = extract_term_frequency_download_metadata(parameters_per_series)
return create_csv.term_frequency_csv(query_per_series, results_per_series, field_name, unit = unit)

@@ -110,6 +112,10 @@ def term_frequency_full_data_tasks(parameters_per_series, visualization_type):
task_function(series_parameters, True) for series_parameters in parameters_unlimited
)

def ngram_full_data_tasks(ngram_parameters, dummy):
ngram_parameters['max_size_per_interval'] = None
return ngram_data_tasks(ngram_parameters)

def extract_term_frequency_download_metadata(parameters_per_series):
'''
Get some relevant metadata for a term frequency request:
@@ -148,16 +154,16 @@ def download_full_data(request_json, user):
'''
Download the full data for a visualisation
'''

visualization_type = request_json['visualization']

task_per_type = {
'date_term_frequency': term_frequency_full_data_tasks,
'aggregate_term_frequency': term_frequency_full_data_tasks
'aggregate_term_frequency': term_frequency_full_data_tasks,
'ngram': ngram_full_data_tasks,
}

parameters = request_json['parameters']
corpus_name = request_json['corpus']
corpus_name = request_json['corpus_name']
corpus = Corpus.objects.get(name=corpus_name)
task = task_per_type[visualization_type](parameters, visualization_type)

Expand All @@ -166,7 +172,7 @@ def download_full_data(request_json, user):

make_chain = lambda : chain(
task,
make_term_frequency_csv.s(parameters),
make_full_data_csv.s(visualization_type, parameters),
complete_download.s(download.id),
csv_data_email.s(user.email, user.username),
).on_error(complete_failed_download.s(download.id))
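For context, a minimal sketch (hypothetical task names, not project code) of how a Celery chain like the one above forwards results: each .s() is a partial signature, and one task's return value is prepended to the next task's arguments.

from celery import chain, shared_task

@shared_task
def produce_results():
    # stands in for the visualisation task group
    return {'words': [], 'time_points': []}

@shared_task
def to_csv(results, visualization_type):
    # 'results' arrives from produce_results; visualization_type was bound via .s()
    return f'/tmp/{visualization_type}.csv'

workflow = chain(produce_results.s(), to_csv.s('ngram'))
# workflow() starts the tasks in order, feeding each result forward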
23 changes: 23 additions & 0 deletions backend/download/tests/test_csv_results.py
@@ -208,3 +208,26 @@ def test_date_format():

for value, unit, expected in cases:
assert create_csv.format_field_value(value, unit) == expected


mock_ngram_data = {
'words': [
{'label': 'ex parrot', 'data': [2, 3]},
{'label': 'this parrot what', 'data': [4, 8]},
{'label': 'dead parrot when', 'data': [4, 6]},
],
'time_points': ['1960-1965', '1962-1967']
}

expected_csv_table = [
{'date': '1960-1965', 'N-gram': 'ex parrot', 'Frequency': 2},
{'date': '1960-1965', 'N-gram': 'this parrot what', 'Frequency': 4},
{'date': '1960-1965', 'N-gram': 'dead parrot when', 'Frequency': 4},
{'date': '1962-1967', 'N-gram': 'ex parrot', 'Frequency': 3},
{'date': '1962-1967', 'N-gram': 'this parrot what', 'Frequency': 8},
{'date': '1962-1967', 'N-gram': 'dead parrot when', 'Frequency': 6},
]

def test_ngram_table():
table = create_csv.ngram_table(mock_ngram_data)
assert table == expected_csv_table
63 changes: 43 additions & 20 deletions backend/download/tests/test_download_views.py
@@ -48,21 +48,7 @@ def term_frequency_parameters(mock_corpus, mock_corpus_specs):
# TODO: construct query from query module, which is much more convenient
query_text = mock_corpus_specs['example_query']
search_field = mock_corpus_specs['content_field']
query = {
"query": {
"bool": {
"must": {
"simple_query_string": {
"query": query_text,
"fields": [search_field],
"lenient": True,
"default_operator": "or"
}
},
"filter": []
}
}
}
query = mock_es_query(query_text, search_field)
return {
'es_query': query,
'corpus_name': mock_corpus,
@@ -78,14 +64,51 @@
'unit': 'year',
}

def ngram_parameters(mock_corpus, mock_corpus_specs):
query_text = mock_corpus_specs['example_query']
search_field = mock_corpus_specs['content_field']
return {
'corpus_name': mock_corpus,
'es_query': mock_es_query(query_text, search_field),
'field': search_field,
'ngram_size': 2,
'term_position': 'any',
'freq_compensation': True,
'subfield': 'clean',
'max_size_per_interval': 50,
'number_of_ngrams': 10,
'date_field': 'date'
}

def mock_es_query(query_text, search_field):
return {
"query": {
"bool": {
"must": {
"simple_query_string": {
"query": query_text,
"fields": [search_field],
"lenient": True,
"default_operator": "or"
}
},
"filter": []
}
}
}

@pytest.mark.parametrize("visualization_type, request_parameters", [('date_term_frequency', term_frequency_parameters), ('ngram', ngram_parameters)])
def test_full_data_download_view(transactional_db, admin_client, small_mock_corpus,
index_small_mock_corpus, small_mock_corpus_specs, celery_worker,
csv_directory):
parameters = term_frequency_parameters(small_mock_corpus, small_mock_corpus_specs)
csv_directory, visualization_type, request_parameters):
parameters = request_parameters(small_mock_corpus, small_mock_corpus_specs)
if visualization_type != 'ngram':
# timeline and histogram expect a series of parameters
parameters = [parameters]
request_json = {
'visualization': 'date_term_frequency',
'parameters': [parameters],
'corpus': small_mock_corpus
'visualization': visualization_type,
'parameters': parameters,
'corpus_name': small_mock_corpus
}
response = admin_client.post(
'/api/download/full_data',
6 changes: 3 additions & 3 deletions backend/download/tests/test_full_data.py
@@ -21,10 +21,10 @@ def test_timeline_full_data(small_mock_corpus, index_small_mock_corpus, small_mo
],
'unit': 'year'
}]

group = tasks.term_frequency_full_data_tasks(full_data_parameters, 'date_term_frequency')
visualization_type = 'date_term_frequency'
group = tasks.term_frequency_full_data_tasks(full_data_parameters, visualization_type)
results = group.apply().get()
filename = tasks.make_term_frequency_csv(results, full_data_parameters)
filename = tasks.make_full_data_csv(results, visualization_type, full_data_parameters)

with open(filename) as f:
reader = csv.DictReader(f)
4 changes: 2 additions & 2 deletions backend/download/views.py
@@ -97,10 +97,10 @@ class FullDataDownloadTaskView(APIView):
permission_classes = [IsAuthenticated, CorpusAccessPermission]

def post(self, request, *args, **kwargs):
check_json_keys(request, ['visualization', 'parameters', 'corpus'])
check_json_keys(request, ['visualization', 'parameters', 'corpus_name'])

visualization_type = request.data['visualization']
known_visualisations = ['date_term_frequency', 'aggregate_term_frequency']
known_visualisations = ['date_term_frequency', 'aggregate_term_frequency', 'ngram']
if visualization_type not in known_visualisations:
raise ParseError(f'Download failed: unknown visualisation type "{visualization_type}"')

Expand Down
2 changes: 1 addition & 1 deletion backend/es/download.py
@@ -43,6 +43,6 @@ def normal_search(corpus, query_model, size):
result = search(
corpus = corpus,
query_model=query_model,
size = size,
size=size,
)
return hits(result)
23 changes: 12 additions & 11 deletions backend/visualization/ngram.py
@@ -5,6 +5,7 @@
from addcorpus.models import CorpusConfiguration
from datetime import datetime
from es.search import get_index, search
from es.download import scroll
from ianalyzer.elasticsearch import elasticsearch
from visualization import query, termvectors

@@ -50,7 +51,7 @@ def get_time_bins(es_query, corpus):
def get_time_bins(es_query, corpus):
"""Wide bins for a query. Depending on the total time range of the query, time intervervals are
10 years (>100 yrs), 5 years (100-20 yrs) of 1 year (<20 yrs)."""

min_date, max_date = get_total_time_interval(es_query, corpus)
min_year, max_year = min_date.year, max_date.year
time_range = max_year - min_year
@@ -77,9 +78,9 @@ return bins
return bins
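An illustrative sketch (not project code; the exact boundary conditions are assumptions) of the bin-width rule the docstring describes:

def bin_width(min_year, max_year):
    # >100 yrs -> 10-year bins; 20-100 yrs -> 5-year bins; <20 yrs -> 1-year bins
    time_range = max_year - min_year
    if time_range > 100:
        return 10
    if time_range >= 20:
        return 5
    return 1

assert bin_width(1800, 1950) == 10
assert bin_width(1900, 1950) == 5
assert bin_width(2000, 2010) == 1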


def tokens_by_time_interval(corpus, es_query, field, bin, ngram_size, term_position, freq_compensation, subfield, max_size_per_interval, date_field):
index = get_index(corpus)
client = elasticsearch(corpus)
def tokens_by_time_interval(corpus_name, es_query, field, bin, ngram_size, term_position, freq_compensation, subfield, max_size_per_interval, date_field, **kwargs):
index = get_index(corpus_name)
client = elasticsearch(corpus_name)
positions_dict = {
'any': list(range(ngram_size)),
'first': [0],
@@ -100,21 +101,21 @@ def tokens_by_time_interval(corpus, es_query, field, bin, ngram_size, term_posit
date_filter = query.make_date_filter(start_date, end_date, date_field)
narrow_query = query.add_filter(es_query, date_filter)
#search for the query text
search_results = search(
corpus=corpus,
query_model = narrow_query,
client = client,
size = max_size_per_interval,
search_results, _total = scroll(
corpus=corpus_name,
query_model=narrow_query,
client=client,
download_size=max_size_per_interval,
)
bin_ngrams = Counter()
for hit in search_results['hits']['hits']:
for hit in search_results:
identifier = hit['_id']
# get the term vectors for the hit
result = client.termvectors(
index=index,
id=identifier,
term_statistics=freq_compensation,
fields = [field]
fields=[field]
)
terms = termvectors.get_terms(result, field)
if terms:
2 changes: 1 addition & 1 deletion backend/visualization/tasks.py
@@ -40,7 +40,7 @@ def ngram_data_tasks(request_json):
]), integrate_ngram_results.s(
number_of_ngrams=request_json['number_of_ngrams']
)
)()
)

@shared_task()
def get_histogram_term_frequency_bin(es_query, corpus_name, field_name, field_value, size, include_query_in_result = False):
4 changes: 2 additions & 2 deletions backend/visualization/tests/test_ngrams.py
@@ -111,10 +111,10 @@ def test_top_10_ngrams():
for w in target_data }
assert dataset_relative['data'] == relative_frequencies[word]

def get_binned_results(corpus, query, time_bins=CENTURY_BINS, ngram_size=2, term_position='any', freq_compensation=None, subfield='none', max_size_per_interval=20, date_field='date'):
def get_binned_results(corpus_name, query, time_bins=CENTURY_BINS, ngram_size=2, term_position='any', freq_compensation=None, subfield='none', max_size_per_interval=20, date_field='date'):
return [
ngram.tokens_by_time_interval(
corpus, query, 'content', bin, ngram_size, term_position, freq_compensation, subfield, max_size_per_interval, date_field)
corpus_name, query, 'content', bin, ngram_size, term_position, freq_compensation, subfield, max_size_per_interval, date_field)
for bin in time_bins
]

2 changes: 1 addition & 1 deletion backend/visualization/views.py
@@ -57,7 +57,7 @@ def post(self, request, *args, **kwargs):

try:
handle_tags_in_request(request)
chord = tasks.ngram_data_tasks(request.data)
chord = tasks.ngram_data_tasks(request.data)()
subtasks = [chord, *chord.parent.children]
return Response({'task_ids': [task.id for task in subtasks]})
except Exception as e:
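With the trailing () moved here, ngram_data_tasks now builds the chord and the view decides when to run it; a sketch of the pattern (hypothetical task names, assumed semantics):

from celery import chord, shared_task

@shared_task
def count_bin(bin_index):
    return bin_index * 2  # stands in for per-bin ngram counting

@shared_task
def integrate(results, number_of_ngrams=10):
    return sorted(results)[:number_of_ngrams]

def build_workflow(bins):
    # return the chord unevaluated, so callers control execution
    return chord([count_bin.s(b) for b in bins], integrate.s(number_of_ngrams=10))

async_result = build_workflow([0, 1, 2])()  # the trailing () starts the tasks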
10 changes: 8 additions & 2 deletions frontend/src/app/services/api.service.ts
@@ -170,12 +170,18 @@ export class ApiService {
| {
visualization: 'date_term_frequency';
parameters: DateTermFrequencyParameters[];
corpus: string;
corpus_name: string;
}
| {
visualization: 'aggregate_term_frequency';
parameters: AggregateTermFrequencyParameters[];
corpus: string;
corpus_name: string;
}
| {
visualization: 'ngram';
parameters: NGramRequestParameters;
corpus_name: string;
}
): Promise<TaskResult> {
const url = this.apiRoute(this.downloadApiURL, 'full_data');