Skip to content

Commit

Permalink
Merge pull request #1501 from UUDigitalHumanitieslab/feature/default-…
Browse files Browse the repository at this point in the history
…sort

Feature/default sort
  • Loading branch information
lukavdplas authored Mar 12, 2024
2 parents e7045ac + 5a654cd commit c7a228d
Show file tree
Hide file tree
Showing 22 changed files with 130 additions and 43 deletions.
2 changes: 1 addition & 1 deletion backend/addcorpus/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ class CorpusConfigurationAdmin(admin.ModelAdmin):
'min_date',
'max_date',
'document_context',
'default_sort',
]
}
), (
Expand Down Expand Up @@ -120,7 +121,6 @@ class FieldAdmin(admin.ModelAdmin):
'searchable',
'search_field_core',
'sortable',
'primary_sort',
]
}
), (
Expand Down
17 changes: 15 additions & 2 deletions backend/addcorpus/corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,21 @@ def es_index(self):
'context_display_name': None
}

'''
Specifies a default configuration for sorting search results.
The value should be a dictionary that specifies `'field'` and `'ascending'`, e.g.:
`{'field': 'date', 'ascending': True }`
The field must be the `name` of a sortable field in the corpus.
This configuration is used when sorting search results when the query context is
empty, i.e. the user has not entered query text. Results from a query are always
sorted on relevance by default.
'''
default_sort = {}

@property
def image(self):
'''
Expand Down Expand Up @@ -328,7 +343,6 @@ def __init__(self,
search_filter=None,
extractor=extract.Constant(None),
sortable=None,
primary_sort=False,
searchable=None,
downloadable=True,
required=False,
Expand Down Expand Up @@ -361,7 +375,6 @@ def __init__(self,
not hidden and indexed and \
mapping_type in ['integer', 'float', 'date']

self.primary_sort = primary_sort

# Fields are searchable if they are not hidden and if they are mapped as 'text'.
# Keyword fields without a filter are also searchable.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Generated by Django 4.2.10 on 2024-03-11 10:44

import addcorpus.validators
from django.db import migrations, models


# Schema migration for the `addcorpus` app: replaces the per-field `primary_sort`
# flag with a `default_sort` setting on the corpus configuration.
class Migration(migrations.Migration):

# Must be applied after migration 0009 of the `addcorpus` app.
dependencies = [
('addcorpus', '0009_corpusconfiguration_citation_page'),
]

operations = [
# Remove the `primary_sort` field from the `field` model.
migrations.RemoveField(
model_name='field',
name='primary_sort',
),
# Add `default_sort` to the `corpusconfiguration` model: a JSON field that may be
# left blank, defaults to an empty dict, and is checked by
# `validate_sort_configuration`.
migrations.AddField(
model_name='corpusconfiguration',
name='default_sort',
field=models.JSONField(blank=True, default=dict, help_text='default sort for search results without query text; if blank, results are presented in the order in which they are stored', validators=[addcorpus.validators.validate_sort_configuration]),
),
]
23 changes: 14 additions & 9 deletions backend/addcorpus/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,13 @@
import warnings

from addcorpus.constants import CATEGORIES, MappingType, VisualizationType
from addcorpus.validators import validate_language_code, validate_image_filename_extension, \
validate_markdown_filename_extension, validate_es_mapping, validate_mimetype, validate_search_filter, \
validate_name_is_not_a_route_parameter, validate_search_filter_with_mapping, validate_searchable_field_has_full_text_search, \
validate_visualizations_with_mapping, validate_implication, any_date_fields, visualisations_require_date_field
from addcorpus.validators import validate_language_code, \
validate_image_filename_extension, validate_markdown_filename_extension, \
validate_es_mapping, validate_mimetype, validate_search_filter, \
validate_name_is_not_a_route_parameter, validate_search_filter_with_mapping, \
validate_searchable_field_has_full_text_search, \
validate_visualizations_with_mapping, validate_implication, any_date_fields, \
visualisations_require_date_field, validate_sort_configuration

MAX_LENGTH_NAME = 126
MAX_LENGTH_DESCRIPTION = 254
Expand Down Expand Up @@ -130,6 +133,13 @@ class CorpusConfiguration(models.Model):
default=False,
help_text='whether this corpus has word models',
)
default_sort = models.JSONField(
blank=True,
validators=[validate_sort_configuration],
default=dict,
help_text='default sort for search results without query text; '
'if blank, results are presented in the order in which they are stored',
)

def __str__(self):
return f'Configuration of <{self.corpus.name}>'
Expand Down Expand Up @@ -239,10 +249,6 @@ class Field(models.Model):
default=False,
help_text='whether search results can be sorted on this field',
)
primary_sort = models.BooleanField(
default=False,
help_text='if sortable: whether this is the default method of sorting search results',
)
searchable = models.BooleanField(
default=False,
help_text='whether this field is listed when selecting search fields',
Expand Down Expand Up @@ -270,7 +276,6 @@ def clean(self):
if self.visualizations:
validate_visualizations_with_mapping(self.es_mapping, self.visualizations)

validate_implication(self.primary_sort, self.sortable, "The primary sorting field must be sortable")
validate_implication(self.csv_core, self.downloadable, "Core download fields must be downloadable")

# core search fields must searchable
Expand Down
3 changes: 2 additions & 1 deletion backend/addcorpus/save_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ def _copy_corpus_attributes(corpus_definition: CorpusDefinition, configuration:
'scan_image_type',
'title',
'word_models_present',
'default_sort',
]

try:
Expand All @@ -68,7 +69,7 @@ def _save_field_in_database(field_definition: FieldDefinition, configuration: Co
'csv_core', 'search_field_core',
'visualizations', 'visualization_sort',
'es_mapping', 'indexed', 'hidden',
'required', 'sortable', 'primary_sort',
'required', 'sortable',
'searchable', 'downloadable'
]

Expand Down
3 changes: 2 additions & 1 deletion backend/addcorpus/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@ class Meta:
'hidden',
'required',
'sortable',
'primary_sort',
'searchable',
'downloadable',
]
Expand Down Expand Up @@ -63,6 +62,7 @@ class CorpusConfigurationSerializer(serializers.ModelSerializer):
fields = FieldSerializer(many=True, read_only=True)
languages = serializers.ListField(child=LanguageField())
category = PrettyChoiceField(choices=CATEGORIES)
default_sort = NonEmptyJSONField()

class Meta:
model = CorpusConfiguration
Expand All @@ -82,6 +82,7 @@ class Meta:
'scan_image_type',
'title',
'word_models_present',
'default_sort',
'fields',
]

Expand Down
14 changes: 14 additions & 0 deletions backend/addcorpus/tests/test_validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,3 +101,17 @@ def test_validate_ngram_has_date_field():
visualisations_require_date_field,
any_date_fields
)

def test_validate_sort_configuration():
    '''
    Check which values `validate_sort_configuration` accepts and which it rejects.
    '''

    # an empty configuration is accepted
    validate_sort_configuration({})

    # a complete configuration with a string field and a boolean direction is accepted
    validate_sort_configuration({
        'field': 'date',
        'ascending': False
    })

    # every configuration below lacks a valid `field` or a valid `ascending` value
    invalid_configurations = [
        {'field': 'date', 'ascending': None},
        {'field': 'date'},
        {'ascending': False},
        {'field': None, 'ascending': False},
    ]

    for config in invalid_configurations:
        with pytest.raises(ValidationError):
            validate_sort_configuration(config)
17 changes: 17 additions & 0 deletions backend/addcorpus/validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,3 +159,20 @@ def any_date_fields(fields):

def visualisations_require_date_field(visualisations):
    '''
    Check whether `'ngram'` is among the given visualisations.

    If `visualisations` is empty or `None`, that falsy value is returned as-is;
    otherwise the result is a boolean.
    '''
    if not visualisations:
        return visualisations

    return 'ngram' in visualisations

def validate_sort_configuration(sort_config):
    '''
    Validates that the object is a sort configuration.

    An empty value (e.g. `{}` or `None`) is accepted. Any other value must be a
    dictionary in which `'field'` is a string and `'ascending'` is a boolean, e.g.:
    `{'field': 'date', 'ascending': True}`

    Raises a `ValidationError` if the value does not have this shape.
    '''

    if not sort_config:
        return

    # Non-dictionary values (e.g. a list or a string) have no `.get`; reject them
    # explicitly so the caller gets a validation error instead of an AttributeError.
    if not isinstance(sort_config, dict):
        raise ValidationError(f'Sort configuration must be a dictionary: {sort_config}')

    field = sort_config.get('field')
    ascending = sort_config.get('ascending')

    if not isinstance(field, str):
        raise ValidationError(f'Sort configuration has invalid "field" property: {field}')

    if not isinstance(ascending, bool):
        raise ValidationError(f'Sort configuration has invalid "ascending" property: {ascending}')
3 changes: 2 additions & 1 deletion backend/corpora/parliament/denmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ class ParliamentDenmark(Parliament, CSVCorpusDefinition):
'sort_direction': 'asc',
}

default_sort = {'field': 'date_latest', 'ascending': False}

def sources(self, start, end):
logger = logging.getLogger('indexing')

Expand Down Expand Up @@ -83,7 +85,6 @@ def sources(self, start, end):
field='year',
transform= lambda value: formatting.get_date_from_year(value, 'latest')
)
date_latest.primary_sort = True
date_latest.search_filter.lower = min_date
date_latest.search_filter.upper = max_date

Expand Down
6 changes: 4 additions & 2 deletions backend/corpora/parliament/finland-old.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,16 @@ def sources(self, start, end):

document_context = document_context()

default_sort = {'field': 'date_latest', 'ascending': False}


chamber = field_defaults.chamber()
chamber.extractor = CSV(field='estate')
chamber.search_filter = MultipleChoiceFilter(
description='Search only in debates from the selected chamber(s)',
option_count=4
)

country = field_defaults.country()
country.extractor = Constant('Finland')

Expand All @@ -52,7 +55,6 @@ def sources(self, start, end):
field='year_end',
transform=lambda value: formatting.get_date_from_year(value, 'latest')
)
date_latest.primary_sort = True
date_latest.search_filter.lower = min_date
date_latest.search_filter.upper = max_date

Expand Down
4 changes: 4 additions & 0 deletions backend/corpora/parliament/norway.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,10 @@ class ParliamentNorway(Parliament, CSVCorpusDefinition):
context_fields=['book_id'],
context_display_name='book',
)
default_sort = {
'field': 'date_latest',
'ascending': False,
}

def sources(self, start, end):
for csv_file in glob('{}/**/*.csv'.format(self.data_directory), recursive=True):
Expand Down
2 changes: 2 additions & 0 deletions backend/corpora/parliament/parliament.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ class Parliament(CorpusDefinition):

category = 'parliament'

default_sort = {'field': 'date', 'ascending': False}

@property
def es_settings(self):
return es_settings(self.languages[:1], stopword_analysis=True, stemming_analysis=True)
Expand Down
4 changes: 4 additions & 0 deletions backend/corpora/parliament/sweden-old.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@ class ParliamentSwedenOld(Parliament, CSVCorpusDefinition):
document_context = constants.document_context(
context_fields=['chamber', 'date_earliest', 'date_latest']
)
default_sort = {
'field': 'date_latest',
'ascending': False,
}

def sources(self, start, end):
for csv_file in sorted(glob('{}/**/*.csv'.format(self.data_directory), recursive=True)):
Expand Down
1 change: 0 additions & 1 deletion backend/corpora/parliament/utils/field_defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,6 @@ def date():
description='Search only within this time range.'
),
visualizations=['resultscount', 'termfrequency'],
primary_sort=True,
csv_core=True,
)

Expand Down
2 changes: 1 addition & 1 deletion backend/corpora/rechtspraak/rechtspraak.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ class Rechtspraak(XMLCorpusDefinition):
toplevel_zip_file = 'OpenDataUitspraken.zip'
languages = ['nl']
category = 'ruling'
default_sort = {'field': 'date', 'ascending': False}

@property
def es_settings(self):
Expand Down Expand Up @@ -177,7 +178,6 @@ def sources(self, min_date: Optional[int] = None, max_date: Optional[int] = None
extractor=rdf_description_extractor('dcterms:date'),
es_mapping={'type': 'date', 'format': 'yyyy-MM-dd'},
results_overview=True,
primary_sort=True,
csv_core=True,
search_filter=filters.DateFilter(
min_date,
Expand Down
2 changes: 1 addition & 1 deletion documentation/Defining-corpus-fields.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,4 +55,4 @@ If a field includes the `'resultscount'` and/or `'termfrequency'` visualisations

`csv_core` determines if a field is included in the CSV download of search results by default.

`sortable` determines whether a field should appear as a sort option. Optionally, you can specify a sortable field that should be used as the default sorting if there is no query (with a query, results are sorted by relevance by default). Set `primary_sort = True` for that field. (Setting primary_sort for more than one field will only affect the first in the list.)
`sortable` determines whether a field should appear as a sort option.
1 change: 1 addition & 0 deletions documentation/How-to-add-a-new-corpus-to-Ianalyzer.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ The following properties are optional:
- `allow_image_download`
- `description_page`: filename of markdown document with a comprehensive description, located in a subdirectory `description` of the corpus definition directory.
- `document_context`: specifies fields that define the natural grouping of documents.
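- `default_sort`: specifies the default sorting of search results when the user has not entered query text, as a dictionary with `'field'` and `'ascending'`, e.g. `{'field': 'date', 'ascending': False}`.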

The corpus class should also define a function `sources(self, start, end)` which iterates source files (presumably within `data_directory`). The `start` and `end` properties define a date range: if possible, only yield files within the range. Each source file should be a tuple of a filename and a dict with metadata.

Expand Down
1 change: 0 additions & 1 deletion frontend/src/app/download/download.component.ts
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,6 @@ export class DownloadComponent implements OnChanges {
csvCore: false,
hidden: false,
sortable: false,
primarySort: false,
searchable: false,
downloadable: true,
filterOptions: null,
Expand Down
12 changes: 3 additions & 9 deletions frontend/src/app/models/corpus.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import * as _ from 'lodash';
import { AdHocFilter, BooleanFilter, DateFilter, MultipleChoiceFilter, RangeFilter, SearchFilter } from './field-filter';
import { FieldFilterOptions } from './field-filter-options';
import { SortBy, SortState } from './sort';
import { SortState } from './sort';
import { Store } from '../store/types';
import { SimpleStore } from '../store/simple-store';

Expand Down Expand Up @@ -35,6 +35,7 @@ export class Corpus implements ElasticSearchIndex {
public citationPage?: string,
public documentContext?: DocumentContext,
public new_highlight?: boolean,
public defaultSort?: SortState,
) { }

get minYear(): number {
Expand All @@ -48,11 +49,6 @@ export class Corpus implements ElasticSearchIndex {
get displayLanguages(): string {
return this.languages.join(', '); // may have to truncate long lists?
}

get defaultSort(): SortState {
const sortBy: SortBy = this.fields.find(field => field.primarySort);
return [sortBy, 'desc'];
}
}

export interface ElasticSearchIndex {
Expand All @@ -67,6 +63,7 @@ export interface DocumentContext {
displayName: string;
}


export type FieldDisplayType = 'text_content' | 'px' | 'keyword' | 'integer' | 'text' | 'date' | 'boolean';

/** Corpus field info as sent by the backend api */
Expand All @@ -87,7 +84,6 @@ export interface ApiCorpusField {
hidden: boolean;
required: boolean;
sortable: boolean;
primary_sort: boolean;
searchable: boolean;
downloadable: boolean;
}
Expand All @@ -109,7 +105,6 @@ export class CorpusField {
positionsOffsets?: boolean;
hidden: boolean;
sortable: boolean;
primarySort: boolean;
searchable: boolean;
downloadable: boolean;
name: string;
Expand All @@ -131,7 +126,6 @@ export class CorpusField {
this.positionsOffsets = data['es_mapping']?.term_vector ? true : false;
this.hidden = data.hidden;
this.sortable = data.sortable;
this.primarySort = data.primary_sort;
this.searchable = data.searchable;
this.downloadable = data.downloadable;
this.name = data.name;
Expand Down
Loading

0 comments on commit c7a228d

Please sign in to comment.