diff --git a/.gitignore b/.gitignore index 168936e7..e3445421 100644 --- a/.gitignore +++ b/.gitignore @@ -39,3 +39,5 @@ app/media/ /logging/ /pdf_uploads/ /pdf_upload_completed/ +/tmp/ +/tmp/app_punt_env diff --git a/app/general/admin.py b/app/general/admin.py index a91edac3..e22ede8d 100644 --- a/app/general/admin.py +++ b/app/general/admin.py @@ -17,7 +17,6 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.fields["document_data"].widget = HiddenInput() - self.fields["search_vector"].widget = HiddenInput() # If the instance has a mime_type, the field should be disabled if not self.instance.mime_type: diff --git a/app/general/filters.py b/app/general/filters.py index 5f3ecbc4..787e386c 100644 --- a/app/general/filters.py +++ b/app/general/filters.py @@ -1,10 +1,16 @@ import django_filters from django import forms -from django.contrib.postgres.search import SearchHeadline, SearchQuery, SearchRank -from django.db.models import F +from django.contrib.postgres.search import ( + SearchHeadline, + SearchQuery, + SearchRank, + SearchVector, +) +from django.db.models import F, Value +from django.db.models.functions import Greatest, Left from django_filters import ModelMultipleChoiceFilter, MultipleChoiceFilter -from general.models import DocumentFile, Institution, Language, Subject +from general.models import DocumentFile, Institution, Language, Project, Subject class DocumentFileFilter(django_filters.FilterSet): @@ -33,27 +39,61 @@ class Meta: ] def filter_queryset(self, queryset): + # More information about weighting and normalization in postgres: + # https://www.postgresql.org/docs/current/textsearch-controls.html#TEXTSEARCH-RANKING queryset = super().filter_queryset(queryset) - search = self.form.cleaned_data.get("search", "") - queue = SearchQuery(search.strip()) - search_rank = SearchRank(F("search_vector"), queue) - search_headline = SearchHeadline("document_data", queue) - queryset = ( - queryset.annotate( - rank=search_rank, - search_headline=search_headline, + search = self.form.cleaned_data.get("search", "").strip() + query = SearchQuery(search) + + # A fixed list of identical fields are required to join queries of + # different classes with `.union()`: + fields = ( + "id", + "heading", + "description", + "rank", + "search_headline", + "view", + "logo_url", + "logo_url", + ) + + # In the queries below, any differences between models must be fixed + # through e.g. `Value` or `F` annotations. + project_search_vector = SearchVector("name", weight="A") + SearchVector( + "description", weight="B" + ) + project_query = ( + Project.objects.annotate( + heading=F("name"), + view=Value("project_detail"), + logo_url=F("logo"), + search_headline=SearchHeadline("description", query), + rank=SearchRank(project_search_vector, query, normalization=16), + search=project_search_vector, ) - .defer("document_data") - .select_related("institution") - ).order_by("-rank") + .filter(search=query) + .values(*fields) + ) - return queryset + # We limit the headline to limit the performance impact. On very large + # documents, this slows things down if unconstrained. + search_headline = SearchHeadline(Left("document_data", 200_000), query) + search_rank = SearchRank(F("search_vector"), query, normalization=16) + queryset = queryset.annotate( + heading=F("title"), + view=Value("document_detail"), + logo_url=Value("no_logo"), + rank=search_rank, + search_headline=search_headline, + ).values(*fields) + return queryset.union(project_query).order_by("-rank") def filter_search(self, queryset, name, value): if value: - queue = SearchQuery(value.strip()) - return queryset.filter(search_vector=queue) + query = SearchQuery(value.strip()) + return queryset.filter(search_vector=query) else: return queryset diff --git a/app/general/migrations/0012_alter_documentfile_search_vector.py b/app/general/migrations/0012_alter_documentfile_search_vector.py new file mode 100644 index 00000000..3e5bc8ec --- /dev/null +++ b/app/general/migrations/0012_alter_documentfile_search_vector.py @@ -0,0 +1,27 @@ +# Generated by Django 5.0.6 on 2024-07-24 12:00 + +import django.contrib.postgres.search +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('general', '0011_alter_documentfile_options_and_more'), + ] + + operations = [ + migrations.RemoveField( + model_name='documentfile', + name='search_vector', + ), # also deletes `search_vector_trigger` and general_doc_search__752b22_gin + migrations.AddField( + model_name='documentfile', + name='search_vector', + field=models.GeneratedField(db_persist=True, expression=django.contrib.postgres.search.CombinedSearchVector(django.contrib.postgres.search.CombinedSearchVector(django.contrib.postgres.search.SearchVector('title', config='english', weight='A'), '||', django.contrib.postgres.search.SearchVector('description', config='english', weight='B'), django.contrib.postgres.search.SearchConfig('english')), '||', django.contrib.postgres.search.SearchVector('document_data', config='english', weight='C'), django.contrib.postgres.search.SearchConfig('english')), null=True, output_field=django.contrib.postgres.search.SearchVectorField()), + ), + migrations.AddIndex( + model_name='documentfile', + index=django.contrib.postgres.indexes.GinIndex(fields=['search_vector'], name='general_doc_search__752b22_gin'), + ), + ] diff --git a/app/general/models.py b/app/general/models.py index b7dd012c..064d7b8a 100644 --- a/app/general/models.py +++ b/app/general/models.py @@ -1,5 +1,5 @@ from django.contrib.postgres.indexes import GinIndex -from django.contrib.postgres.search import SearchVectorField +from django.contrib.postgres.search import SearchVector, SearchVectorField from django.core.validators import FileExtensionValidator from django.db import models from django.utils.translation import gettext_lazy as _ @@ -142,7 +142,19 @@ class DocumentFile(models.Model): subjects = models.ManyToManyField("Subject", blank=True, verbose_name=_("subjects")) languages = models.ManyToManyField("Language", blank=True, verbose_name=_("languages")) - search_vector = SearchVectorField(null=True, blank=True) + # `config="english"` is required to ensure that the `expression` is + # immutable, otherwise the migration to the GeneratedField fails. + search_vector = models.GeneratedField( + expression=( + SearchVector("title", config="english", weight="A") + + SearchVector("description", config="english", weight="B") + + SearchVector("document_data", config="english", weight="C") + ), + output_field=SearchVectorField(), + db_persist=True, + null=True, + blank=True, + ) # added simple historical records to the model history = HistoricalRecords(excluded_fields=["document_data", "search_vector"]) diff --git a/app/general/tests/test_filter.py b/app/general/tests/test_filter.py index 1cc5650c..3f78cdbf 100644 --- a/app/general/tests/test_filter.py +++ b/app/general/tests/test_filter.py @@ -3,7 +3,7 @@ from django.test import TestCase from general.filters import DocumentFileFilter -from general.models import DocumentFile, Institution, Language, Subject +from general.models import DocumentFile, Institution, Language, Project, Subject class TestSearchFilter(TestCase): @@ -38,30 +38,48 @@ def setUp(self): self.doc2.subjects.add(self.subject2) self.doc2.languages.add(self.language2) + # Create Projects for search testing + self.project1 = Project.objects.create( + name="Project 1", + description="Project 1 description", + institution=self.institution1, + logo="logo1.png", + ) + def test_institution_filter(self): data = {"institution": [self.institution1.id]} filter = DocumentFileFilter(data=data) - self.assertEqual(len(filter.qs), 1) - self.assertIn(self.doc1, filter.qs) + qs = filter.qs + self.assertEqual(len(qs), 1) + self.assertEqual(qs[0]["id"], self.doc1.id) + + def test_institution_filter(self): + data = {"institution": [self.institution1.id]} + filter = DocumentFileFilter(data=data) + qs = filter.qs + self.assertEqual(len(qs), 1) + self.assertEqual(qs[0]["id"], self.doc1.id) def test_document_type_filter(self): data = {"document_type": ["glossary"]} filter = DocumentFileFilter(data=data) - self.assertEqual(len(filter.qs), 2) - self.assertIn(self.doc1, filter.qs) - self.assertIn(self.doc2, filter.qs) + qs = filter.qs + self.assertEqual(len(qs), 2) + self.assertCountEqual([qs[0]["id"], qs[1]["id"]], [self.doc1.id, self.doc2.id]) def test_subjects_filter(self): data = {"subjects": [self.subject1.id]} filter = DocumentFileFilter(data=data) - self.assertEqual(len(filter.qs), 1) - self.assertIn(self.doc1, filter.qs) + qs = filter.qs + self.assertEqual(len(qs), 1) + self.assertEqual(qs[0]["id"], self.doc1.id) def test_languages_filter(self): data = {"languages": [self.language1.id]} filter = DocumentFileFilter(data=data) - self.assertEqual(len(filter.qs), 1) - self.assertIn(self.doc1, filter.qs) + qs = filter.qs + self.assertEqual(len(qs), 1) + self.assertEqual(qs[0]["id"], self.doc1.id) def test_combined_filters(self): data = { @@ -71,20 +89,36 @@ def test_combined_filters(self): "languages": [self.language1.id], } filter = DocumentFileFilter(data=data) - self.assertEqual(len(filter.qs), 1) - self.assertIn(self.doc1, filter.qs) + qs = filter.qs + self.assertEqual(len(qs), 1) + self.assertEqual(qs[0]["id"], self.doc1.id) - def test_search_filter(self): + def test_search_filter_documents(self): data = {"search": "Document"} filter = DocumentFileFilter(data=data) - self.assertEqual(len(filter.qs), 2) - self.assertIn(self.doc1, filter.qs) - self.assertIn(self.doc2, filter.qs) + qs = filter.qs + self.assertEqual(len(qs), 2) + self.assertCountEqual([qs[0]["id"], qs[1]["id"]], [self.doc1.id, self.doc2.id]) data = {"search": "Document 1"} filter = DocumentFileFilter(data=data) - self.assertEqual(len(filter.qs), 1) - self.assertIn(self.doc1, filter.qs) + qs = filter.qs + self.assertEqual(len(qs), 1) + self.assertEqual(qs[0]["id"], self.doc1.id) + + def test_search_filter_projects(self): + data = {"search": "Project 1"} + filter = DocumentFileFilter(data=data) + qs = filter.qs + self.assertEqual(len(qs), 1) + self.assertEqual(qs[0]["id"], self.project1.id) + + def test_search_filter_combined(self): + data = {"search": "1"} + filter = DocumentFileFilter(data=data) + qs = filter.qs + self.assertEqual(len(qs), 2) + self.assertCountEqual([qs[0]["id"], qs[1]["id"]], [self.doc1.id, self.project1.id]) if __name__ == "__main__": diff --git a/app/general/tests/test_view_search.py b/app/general/tests/test_view_search.py index 5d0386ab..21ab0c42 100644 --- a/app/general/tests/test_view_search.py +++ b/app/general/tests/test_view_search.py @@ -48,7 +48,7 @@ def test_search_filtering(self): client = Client() response = client.get(reverse("search"), {"search": "Document 1"}) self.assertEqual(response.status_code, 200) - self.assertEqual(response.context["documents"][0].title, "Document 1") + self.assertEqual(response.context["documents"][0]["heading"], "Document 1") def test_invalid_page_number(self): client = Client() diff --git a/app/templates/app/search.html b/app/templates/app/search.html index 4709731b..97524242 100644 --- a/app/templates/app/search.html +++ b/app/templates/app/search.html @@ -18,42 +18,23 @@
{{ result.description | truncatewords:10 }}
++ {% trans "Excerpt:" %} + {{ result.search_headline|safe }} +
++ {% if result.logo_url != "no_logo" %} + + {% endif %} +
++ {{ result.rank }} +