From 13077b737d68336701770ecaf26d4d104843eaeb Mon Sep 17 00:00:00 2001 From: Daniel Gray Date: Tue, 16 Jul 2024 08:27:01 +0200 Subject: [PATCH 1/8] added projects to search filters updated tests --- app/general/filters.py | 36 +++++++++++++++++++++++++++----- app/general/tests/test_filter.py | 4 ++-- app/templates/app/search.html | 21 +++++++++++++++++-- 3 files changed, 52 insertions(+), 9 deletions(-) diff --git a/app/general/filters.py b/app/general/filters.py index 5f3ecbc4..26d20e97 100644 --- a/app/general/filters.py +++ b/app/general/filters.py @@ -1,10 +1,15 @@ import django_filters from django import forms -from django.contrib.postgres.search import SearchHeadline, SearchQuery, SearchRank +from django.contrib.postgres.search import ( + SearchHeadline, + SearchQuery, + SearchRank, + SearchVector, +) from django.db.models import F from django_filters import ModelMultipleChoiceFilter, MultipleChoiceFilter -from general.models import DocumentFile, Institution, Language, Subject +from general.models import DocumentFile, Institution, Language, Project, Subject class DocumentFileFilter(django_filters.FilterSet): @@ -35,8 +40,27 @@ class Meta: def filter_queryset(self, queryset): queryset = super().filter_queryset(queryset) - search = self.form.cleaned_data.get("search", "") - queue = SearchQuery(search.strip()) + search = self.form.cleaned_data.get("search", "").strip() + queue = SearchQuery(search) + + search_document_files = ( + DocumentFile.objects.annotate( + search=SearchVector("title") + SearchVector("description"), + search_headline=SearchHeadline("description", queue), + ) + .filter(search=SearchQuery(search)) + .only("title", "description") + ) + + project_query = ( + Project.objects.annotate( + search=SearchVector("name") + SearchVector("description"), + search_headline=SearchHeadline("description", queue), + ) + .filter(search=SearchQuery(search)) + .defer("institution_id", "subjects", "languages") + ) + search_rank = SearchRank(F("search_vector"), queue) search_headline = SearchHeadline("document_data", queue) queryset = ( @@ -48,7 +72,9 @@ def filter_queryset(self, queryset): .select_related("institution") ).order_by("-rank") - return queryset + combined_results = list(project_query) + list(search_document_files) + list(queryset) + + return combined_results def filter_search(self, queryset, name, value): if value: diff --git a/app/general/tests/test_filter.py b/app/general/tests/test_filter.py index 1cc5650c..c9d0ca92 100644 --- a/app/general/tests/test_filter.py +++ b/app/general/tests/test_filter.py @@ -77,13 +77,13 @@ def test_combined_filters(self): def test_search_filter(self): data = {"search": "Document"} filter = DocumentFileFilter(data=data) - self.assertEqual(len(filter.qs), 2) + self.assertEqual(len(filter.qs), 4) self.assertIn(self.doc1, filter.qs) self.assertIn(self.doc2, filter.qs) data = {"search": "Document 1"} filter = DocumentFileFilter(data=data) - self.assertEqual(len(filter.qs), 1) + self.assertEqual(len(filter.qs), 2) self.assertIn(self.doc1, filter.qs) diff --git a/app/templates/app/search.html b/app/templates/app/search.html index ea46519f..15081352 100644 --- a/app/templates/app/search.html +++ b/app/templates/app/search.html @@ -22,6 +22,23 @@
{% trans "Search a term" %}
-

{{ document.title }}

- {{ document.institution }} -
- {% trans "Excerpt:" %} - … {{ document.search_headline|safe }} … -
-
- {% trans "License:" %} - {{ document.license }} -
-
-{{ document.languages.all |join:", " }} -
-
-{{ document.subjects.all |join:", " }} -
-
+

{{ result.heading }}

+

{{ result.description | truncatewords:10 }}

+

+ {% trans "Excerpt:" %} + {{ result.search_headline|safe }} +

+

+ {{ result.rank }} +

{% endfor %} From cd28611d0f3183e2bff93cc195083ea0d8acad9c Mon Sep 17 00:00:00 2001 From: Friedel Wolff Date: Tue, 23 Jul 2024 16:59:17 +0200 Subject: [PATCH 3/8] First stab at search weighting and normalization --- app/general/filters.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/app/general/filters.py b/app/general/filters.py index 7a1ce4a9..73f195e4 100644 --- a/app/general/filters.py +++ b/app/general/filters.py @@ -7,7 +7,7 @@ SearchVector, ) from django.db.models import F, Value -from django.db.models.functions import Left +from django.db.models.functions import Greatest, Left from django_filters import ModelMultipleChoiceFilter, MultipleChoiceFilter from general.models import DocumentFile, Institution, Language, Project, Subject @@ -39,6 +39,8 @@ class Meta: ] def filter_queryset(self, queryset): + # More information about weighting and normalization in postgres: + # https://www.postgresql.org/docs/current/textsearch-controls.html#TEXTSEARCH-RANKING queryset = super().filter_queryset(queryset) search = self.form.cleaned_data.get("search", "").strip() @@ -50,13 +52,15 @@ def filter_queryset(self, queryset): # In the queries below, any differences between models must be fixed # through e.g. `Value` or `F` annotations. - project_search_vector = SearchVector("name") + SearchVector("description") + project_search_vector = SearchVector("name", weight="A") + SearchVector( + "description", weight="B" + ) project_query = ( Project.objects.annotate( heading=F("name"), view=Value("project_detail"), search_headline=SearchHeadline("description", query), - rank=SearchRank(project_search_vector, query), + rank=SearchRank(project_search_vector, query, normalization=16), search=project_search_vector, ) .filter(search=SearchQuery(search)) @@ -66,11 +70,19 @@ def filter_queryset(self, queryset): # We limit the headline to limit the performance impact. On very large # documents, this slows things down if unconstrained. search_headline = SearchHeadline(Left("document_data", 200_000), query) - search_rank = SearchRank(F("search_vector"), query) + doc_search_rank = SearchRank(F("search_vector"), query) + # While the search_vector field doesn't provide weighted vectors, we + # work around that to ensure that hits in the title or description are + # still ranked high. See `doc_rank` and `rank` below. + extra_search_vector = SearchVector("title", weight="A") + SearchVector( + "description", weight="B" + ) + search_rank = SearchRank(extra_search_vector, query, normalization=16) queryset = queryset.annotate( heading=F("title"), view=Value("document_detail"), - rank=search_rank, + doc_rank=doc_search_rank, + rank=Greatest(F("doc_rank"), search_rank), search_headline=search_headline, ).values(*fields) return queryset.union(project_query).order_by("-rank") From c2fff8538d134ba445f93f446d9bbf42c0769ad5 Mon Sep 17 00:00:00 2001 From: Friedel Wolff Date: Wed, 24 Jul 2024 15:14:58 +0200 Subject: [PATCH 4/8] Small cleanups --- app/general/filters.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/app/general/filters.py b/app/general/filters.py index 73f195e4..2dded844 100644 --- a/app/general/filters.py +++ b/app/general/filters.py @@ -63,7 +63,7 @@ def filter_queryset(self, queryset): rank=SearchRank(project_search_vector, query, normalization=16), search=project_search_vector, ) - .filter(search=SearchQuery(search)) + .filter(search=query) .values(*fields) ) @@ -89,8 +89,8 @@ def filter_queryset(self, queryset): def filter_search(self, queryset, name, value): if value: - queue = SearchQuery(value.strip()) - return queryset.filter(search_vector=queue) + query = SearchQuery(value.strip()) + return queryset.filter(search_vector=query) else: return queryset From 8d9799828ff3efec169bb3784c90fa147f5be37a Mon Sep 17 00:00:00 2001 From: Friedel Wolff Date: Wed, 24 Jul 2024 15:19:26 +0200 Subject: [PATCH 5/8] Store weights in generated documentfile.search_vector This stores the correct weighted vector, and includes the document description that was missing. --- app/general/admin.py | 1 - app/general/filters.py | 12 ++----- .../0012_alter_documentfile_search_vector.py | 32 +++++++++++++++++++ app/general/models.py | 16 ++++++++-- 4 files changed, 48 insertions(+), 13 deletions(-) create mode 100644 app/general/migrations/0012_alter_documentfile_search_vector.py diff --git a/app/general/admin.py b/app/general/admin.py index a91edac3..e22ede8d 100644 --- a/app/general/admin.py +++ b/app/general/admin.py @@ -17,7 +17,6 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.fields["document_data"].widget = HiddenInput() - self.fields["search_vector"].widget = HiddenInput() # If the instance has a mime_type, the field should be disabled if not self.instance.mime_type: diff --git a/app/general/filters.py b/app/general/filters.py index 2dded844..b49adddf 100644 --- a/app/general/filters.py +++ b/app/general/filters.py @@ -70,19 +70,11 @@ def filter_queryset(self, queryset): # We limit the headline to limit the performance impact. On very large # documents, this slows things down if unconstrained. search_headline = SearchHeadline(Left("document_data", 200_000), query) - doc_search_rank = SearchRank(F("search_vector"), query) - # While the search_vector field doesn't provide weighted vectors, we - # work around that to ensure that hits in the title or description are - # still ranked high. See `doc_rank` and `rank` below. - extra_search_vector = SearchVector("title", weight="A") + SearchVector( - "description", weight="B" - ) - search_rank = SearchRank(extra_search_vector, query, normalization=16) + search_rank = SearchRank(F("search_vector"), query, normalization=16) queryset = queryset.annotate( heading=F("title"), view=Value("document_detail"), - doc_rank=doc_search_rank, - rank=Greatest(F("doc_rank"), search_rank), + rank=search_rank, search_headline=search_headline, ).values(*fields) return queryset.union(project_query).order_by("-rank") diff --git a/app/general/migrations/0012_alter_documentfile_search_vector.py b/app/general/migrations/0012_alter_documentfile_search_vector.py new file mode 100644 index 00000000..2bde464c --- /dev/null +++ b/app/general/migrations/0012_alter_documentfile_search_vector.py @@ -0,0 +1,32 @@ +import django.contrib.postgres.search +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ('general', '0011_alter_documentfile_options_and_more'), + ] + + operations = [ + migrations.RemoveField( + model_name='documentfile', + name='search_vector', + ), # also deletes `search_vector_trigger` and general_doc_search__752b22_gin + migrations.AddField( + model_name='documentfile', + name='search_vector', + field=models.GeneratedField(db_persist=True, expression=django.contrib.postgres.search.CombinedSearchVector( + django.contrib.postgres.search.CombinedSearchVector( + django.contrib.postgres.search.SearchVector('title', config='english', weight='A'), '||', + django.contrib.postgres.search.SearchVector('description', config='english', weight='B'), + django.contrib.postgres.search.SearchConfig('english')), '||', + django.contrib.postgres.search.SearchVector('document_data', config='english', weight='C'), + django.contrib.postgres.search.SearchConfig('english')), null=True, + output_field=django.contrib.postgres.search.SearchVectorField()), + ), + migrations.AddIndex( + model_name='documentfile', + index=django.contrib.postgres.indexes.GinIndex(fields=['search_vector'], + name='general_doc_search__752b22_gin'), + ), + ] diff --git a/app/general/models.py b/app/general/models.py index b7dd012c..064d7b8a 100644 --- a/app/general/models.py +++ b/app/general/models.py @@ -1,5 +1,5 @@ from django.contrib.postgres.indexes import GinIndex -from django.contrib.postgres.search import SearchVectorField +from django.contrib.postgres.search import SearchVector, SearchVectorField from django.core.validators import FileExtensionValidator from django.db import models from django.utils.translation import gettext_lazy as _ @@ -142,7 +142,19 @@ class DocumentFile(models.Model): subjects = models.ManyToManyField("Subject", blank=True, verbose_name=_("subjects")) languages = models.ManyToManyField("Language", blank=True, verbose_name=_("languages")) - search_vector = SearchVectorField(null=True, blank=True) + # `config="english"` is required to ensure that the `expression` is + # immutable, otherwise the migration to the GeneratedField fails. + search_vector = models.GeneratedField( + expression=( + SearchVector("title", config="english", weight="A") + + SearchVector("description", config="english", weight="B") + + SearchVector("document_data", config="english", weight="C") + ), + output_field=SearchVectorField(), + db_persist=True, + null=True, + blank=True, + ) # added simple historical records to the model history = HistoricalRecords(excluded_fields=["document_data", "search_vector"]) From 5bdc4ee759a8fab81e6d11b5969b5a5cfddc866d Mon Sep 17 00:00:00 2001 From: Friedel Wolff Date: Thu, 25 Jul 2024 10:03:17 +0200 Subject: [PATCH 6/8] Search performance improvements The `UNION ALL` improves performance, particularly of the `COUNT` query that is needed for pagination. Smaller headlines are hopefully also a usability improvement. --- app/general/filters.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/app/general/filters.py b/app/general/filters.py index b49adddf..90fb1b95 100644 --- a/app/general/filters.py +++ b/app/general/filters.py @@ -59,7 +59,7 @@ def filter_queryset(self, queryset): Project.objects.annotate( heading=F("name"), view=Value("project_detail"), - search_headline=SearchHeadline("description", query), + search_headline=SearchHeadline("description", query, max_words=15, min_words=10), rank=SearchRank(project_search_vector, query, normalization=16), search=project_search_vector, ) @@ -69,7 +69,9 @@ def filter_queryset(self, queryset): # We limit the headline to limit the performance impact. On very large # documents, this slows things down if unconstrained. - search_headline = SearchHeadline(Left("document_data", 200_000), query) + search_headline = SearchHeadline( + Left("document_data", 20_000), query, max_words=15, min_words=10 + ) search_rank = SearchRank(F("search_vector"), query, normalization=16) queryset = queryset.annotate( heading=F("title"), @@ -77,7 +79,7 @@ def filter_queryset(self, queryset): rank=search_rank, search_headline=search_headline, ).values(*fields) - return queryset.union(project_query).order_by("-rank") + return queryset.union(project_query, all=True).order_by("-rank") def filter_search(self, queryset, name, value): if value: From 3d9c74a7261c5e3a22e8aee8ac8b47fe203880bb Mon Sep 17 00:00:00 2001 From: Daniel Gray Date: Thu, 25 Jul 2024 10:39:04 +0200 Subject: [PATCH 7/8] +logo_url in search results --- app/general/filters.py | 12 +++++++++++- app/templates/app/search.html | 8 +++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/app/general/filters.py b/app/general/filters.py index 90fb1b95..12f670a4 100644 --- a/app/general/filters.py +++ b/app/general/filters.py @@ -48,7 +48,15 @@ def filter_queryset(self, queryset): # A fixed list of identical fields are required to join queries of # different classes with `.union()`: - fields = ("id", "heading", "description", "rank", "search_headline", "view") + fields = ( + "id", + "heading", + "description", + "rank", + "search_headline", + "view", + "logo_url", + ) # In the queries below, any differences between models must be fixed # through e.g. `Value` or `F` annotations. @@ -59,6 +67,7 @@ def filter_queryset(self, queryset): Project.objects.annotate( heading=F("name"), view=Value("project_detail"), + logo_url=F("logo"), search_headline=SearchHeadline("description", query, max_words=15, min_words=10), rank=SearchRank(project_search_vector, query, normalization=16), search=project_search_vector, @@ -76,6 +85,7 @@ def filter_queryset(self, queryset): queryset = queryset.annotate( heading=F("title"), view=Value("document_detail"), + logo_url=Value(""), rank=search_rank, search_headline=search_headline, ).values(*fields) diff --git a/app/templates/app/search.html b/app/templates/app/search.html index f5ee4b62..910fa73f 100644 --- a/app/templates/app/search.html +++ b/app/templates/app/search.html @@ -20,12 +20,18 @@
{% trans "Search a term" %}
{% for result in search_results %}
-

{{ result.heading }}

+

{{ result.heading }}

{{ result.description | truncatewords:10 }}

{% trans "Excerpt:" %} {{ result.search_headline|safe }}

+

+ {% if result.logo_url %} + + {% endif %} +

{{ result.rank }}

From 92390b032cd197d9caf0d3439f7a80552a2d038e Mon Sep 17 00:00:00 2001 From: Daniel Gray Date: Thu, 25 Jul 2024 10:41:14 +0200 Subject: [PATCH 8/8] Update search tests for combined results --- app/general/tests/test_filter.py | 70 ++++++++++++++++++++------- app/general/tests/test_view_search.py | 2 +- 2 files changed, 53 insertions(+), 19 deletions(-) diff --git a/app/general/tests/test_filter.py b/app/general/tests/test_filter.py index c9d0ca92..3f78cdbf 100644 --- a/app/general/tests/test_filter.py +++ b/app/general/tests/test_filter.py @@ -3,7 +3,7 @@ from django.test import TestCase from general.filters import DocumentFileFilter -from general.models import DocumentFile, Institution, Language, Subject +from general.models import DocumentFile, Institution, Language, Project, Subject class TestSearchFilter(TestCase): @@ -38,30 +38,48 @@ def setUp(self): self.doc2.subjects.add(self.subject2) self.doc2.languages.add(self.language2) + # Create Projects for search testing + self.project1 = Project.objects.create( + name="Project 1", + description="Project 1 description", + institution=self.institution1, + logo="logo1.png", + ) + def test_institution_filter(self): data = {"institution": [self.institution1.id]} filter = DocumentFileFilter(data=data) - self.assertEqual(len(filter.qs), 1) - self.assertIn(self.doc1, filter.qs) + qs = filter.qs + self.assertEqual(len(qs), 1) + self.assertEqual(qs[0]["id"], self.doc1.id) + + def test_institution_filter(self): + data = {"institution": [self.institution1.id]} + filter = DocumentFileFilter(data=data) + qs = filter.qs + self.assertEqual(len(qs), 1) + self.assertEqual(qs[0]["id"], self.doc1.id) def test_document_type_filter(self): data = {"document_type": ["glossary"]} filter = DocumentFileFilter(data=data) - self.assertEqual(len(filter.qs), 2) - self.assertIn(self.doc1, filter.qs) - self.assertIn(self.doc2, filter.qs) + qs = filter.qs + self.assertEqual(len(qs), 2) + self.assertCountEqual([qs[0]["id"], qs[1]["id"]], [self.doc1.id, self.doc2.id]) def test_subjects_filter(self): data = {"subjects": [self.subject1.id]} filter = DocumentFileFilter(data=data) - self.assertEqual(len(filter.qs), 1) - self.assertIn(self.doc1, filter.qs) + qs = filter.qs + self.assertEqual(len(qs), 1) + self.assertEqual(qs[0]["id"], self.doc1.id) def test_languages_filter(self): data = {"languages": [self.language1.id]} filter = DocumentFileFilter(data=data) - self.assertEqual(len(filter.qs), 1) - self.assertIn(self.doc1, filter.qs) + qs = filter.qs + self.assertEqual(len(qs), 1) + self.assertEqual(qs[0]["id"], self.doc1.id) def test_combined_filters(self): data = { @@ -71,20 +89,36 @@ def test_combined_filters(self): "languages": [self.language1.id], } filter = DocumentFileFilter(data=data) - self.assertEqual(len(filter.qs), 1) - self.assertIn(self.doc1, filter.qs) + qs = filter.qs + self.assertEqual(len(qs), 1) + self.assertEqual(qs[0]["id"], self.doc1.id) - def test_search_filter(self): + def test_search_filter_documents(self): data = {"search": "Document"} filter = DocumentFileFilter(data=data) - self.assertEqual(len(filter.qs), 4) - self.assertIn(self.doc1, filter.qs) - self.assertIn(self.doc2, filter.qs) + qs = filter.qs + self.assertEqual(len(qs), 2) + self.assertCountEqual([qs[0]["id"], qs[1]["id"]], [self.doc1.id, self.doc2.id]) data = {"search": "Document 1"} filter = DocumentFileFilter(data=data) - self.assertEqual(len(filter.qs), 2) - self.assertIn(self.doc1, filter.qs) + qs = filter.qs + self.assertEqual(len(qs), 1) + self.assertEqual(qs[0]["id"], self.doc1.id) + + def test_search_filter_projects(self): + data = {"search": "Project 1"} + filter = DocumentFileFilter(data=data) + qs = filter.qs + self.assertEqual(len(qs), 1) + self.assertEqual(qs[0]["id"], self.project1.id) + + def test_search_filter_combined(self): + data = {"search": "1"} + filter = DocumentFileFilter(data=data) + qs = filter.qs + self.assertEqual(len(qs), 2) + self.assertCountEqual([qs[0]["id"], qs[1]["id"]], [self.doc1.id, self.project1.id]) if __name__ == "__main__": diff --git a/app/general/tests/test_view_search.py b/app/general/tests/test_view_search.py index 5d0386ab..21ab0c42 100644 --- a/app/general/tests/test_view_search.py +++ b/app/general/tests/test_view_search.py @@ -48,7 +48,7 @@ def test_search_filtering(self): client = Client() response = client.get(reverse("search"), {"search": "Document 1"}) self.assertEqual(response.status_code, 200) - self.assertEqual(response.context["documents"][0].title, "Document 1") + self.assertEqual(response.context["documents"][0]["heading"], "Document 1") def test_invalid_page_number(self): client = Client()