Skip to content

Commit

Permalink
added projects to search filters
Browse files Browse the repository at this point in the history
- Combine search queries with union
- updated tests
  • Loading branch information
daniel-gray-tangent committed Jul 25, 2024
1 parent cb1a749 commit 6f558f6
Show file tree
Hide file tree
Showing 8 changed files with 177 additions and 92 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,5 @@ app/media/
/logging/
/pdf_uploads/
/pdf_upload_completed/
/tmp/
/tmp/app_punt_env
1 change: 0 additions & 1 deletion app/general/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

self.fields["document_data"].widget = HiddenInput()
self.fields["search_vector"].widget = HiddenInput()

# If the instance has a mime_type, the field should be disabled
if not self.instance.mime_type:
Expand Down
74 changes: 57 additions & 17 deletions app/general/filters.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,16 @@
import django_filters
from django import forms
from django.contrib.postgres.search import SearchHeadline, SearchQuery, SearchRank
from django.db.models import F
from django.contrib.postgres.search import (
SearchHeadline,
SearchQuery,
SearchRank,
SearchVector,
)
from django.db.models import F, Value
from django.db.models.functions import Greatest, Left
from django_filters import ModelMultipleChoiceFilter, MultipleChoiceFilter

from general.models import DocumentFile, Institution, Language, Subject
from general.models import DocumentFile, Institution, Language, Project, Subject


class DocumentFileFilter(django_filters.FilterSet):
Expand Down Expand Up @@ -33,27 +39,61 @@ class Meta:
]

def filter_queryset(self, queryset):
# More information about weighting and normalization in postgres:
# https://www.postgresql.org/docs/current/textsearch-controls.html#TEXTSEARCH-RANKING
queryset = super().filter_queryset(queryset)

search = self.form.cleaned_data.get("search", "")
queue = SearchQuery(search.strip())
search_rank = SearchRank(F("search_vector"), queue)
search_headline = SearchHeadline("document_data", queue)
queryset = (
queryset.annotate(
rank=search_rank,
search_headline=search_headline,
search = self.form.cleaned_data.get("search", "").strip()
query = SearchQuery(search)

# A fixed list of identical fields are required to join queries of
# different classes with `.union()`:
fields = (
"id",
"heading",
"description",
"rank",
"search_headline",
"view",
"logo_url",
"logo_url",
)

# In the queries below, any differences between models must be fixed
# through e.g. `Value` or `F` annotations.
project_search_vector = SearchVector("name", weight="A") + SearchVector(
"description", weight="B"
)
project_query = (
Project.objects.annotate(
heading=F("name"),
view=Value("project_detail"),
logo_url=F("logo"),
search_headline=SearchHeadline("description", query),
rank=SearchRank(project_search_vector, query, normalization=16),
search=project_search_vector,
)
.defer("document_data")
.select_related("institution")
).order_by("-rank")
.filter(search=query)
.values(*fields)
)

return queryset
# We limit the headline to limit the performance impact. On very large
# documents, this slows things down if unconstrained.
search_headline = SearchHeadline(Left("document_data", 200_000), query)
search_rank = SearchRank(F("search_vector"), query, normalization=16)
queryset = queryset.annotate(
heading=F("title"),
view=Value("document_detail"),
logo_url=Value("no_logo"),
rank=search_rank,
search_headline=search_headline,
).values(*fields)
return queryset.union(project_query).order_by("-rank")

def filter_search(self, queryset, name, value):
if value:
queue = SearchQuery(value.strip())
return queryset.filter(search_vector=queue)
query = SearchQuery(value.strip())
return queryset.filter(search_vector=query)

else:
return queryset
32 changes: 32 additions & 0 deletions app/general/migrations/0012_alter_documentfile_search_vector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import django.contrib.postgres.search
from django.db import migrations, models


class Migration(migrations.Migration):
dependencies = [
('general', '0011_alter_documentfile_options_and_more'),
]

operations = [
migrations.RemoveField(
model_name='documentfile',
name='search_vector',
), # also deletes `search_vector_trigger` and general_doc_search__752b22_gin
migrations.AddField(
model_name='documentfile',
name='search_vector',
field=models.GeneratedField(db_persist=True, expression=django.contrib.postgres.search.CombinedSearchVector(
django.contrib.postgres.search.CombinedSearchVector(
django.contrib.postgres.search.SearchVector('title', config='english', weight='A'), '||',
django.contrib.postgres.search.SearchVector('description', config='english', weight='B'),
django.contrib.postgres.search.SearchConfig('english')), '||',
django.contrib.postgres.search.SearchVector('document_data', config='english', weight='C'),
django.contrib.postgres.search.SearchConfig('english')), null=True,
output_field=django.contrib.postgres.search.SearchVectorField()),
),
migrations.AddIndex(
model_name='documentfile',
index=django.contrib.postgres.indexes.GinIndex(fields=['search_vector'],
name='general_doc_search__752b22_gin'),
),
]
16 changes: 14 additions & 2 deletions app/general/models.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from django.contrib.postgres.indexes import GinIndex
from django.contrib.postgres.search import SearchVectorField
from django.contrib.postgres.search import SearchVector, SearchVectorField
from django.core.validators import FileExtensionValidator
from django.db import models
from django.utils.translation import gettext_lazy as _
Expand Down Expand Up @@ -142,7 +142,19 @@ class DocumentFile(models.Model):
subjects = models.ManyToManyField("Subject", blank=True, verbose_name=_("subjects"))
languages = models.ManyToManyField("Language", blank=True, verbose_name=_("languages"))

search_vector = SearchVectorField(null=True, blank=True)
# `config="english"` is required to ensure that the `expression` is
# immutable, otherwise the migration to the GeneratedField fails.
search_vector = models.GeneratedField(
expression=(
SearchVector("title", config="english", weight="A")
+ SearchVector("description", config="english", weight="B")
+ SearchVector("document_data", config="english", weight="C")
),
output_field=SearchVectorField(),
db_persist=True,
null=True,
blank=True,
)

# added simple historical records to the model
history = HistoricalRecords(excluded_fields=["document_data", "search_vector"])
Expand Down
70 changes: 52 additions & 18 deletions app/general/tests/test_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from django.test import TestCase

from general.filters import DocumentFileFilter
from general.models import DocumentFile, Institution, Language, Subject
from general.models import DocumentFile, Institution, Language, Project, Subject


class TestSearchFilter(TestCase):
Expand Down Expand Up @@ -38,30 +38,48 @@ def setUp(self):
self.doc2.subjects.add(self.subject2)
self.doc2.languages.add(self.language2)

# Create Projects for search testing
self.project1 = Project.objects.create(
name="Project 1",
description="Project 1 description",
institution=self.institution1,
logo="logo1.png",
)

def test_institution_filter(self):
data = {"institution": [self.institution1.id]}
filter = DocumentFileFilter(data=data)
self.assertEqual(len(filter.qs), 1)
self.assertIn(self.doc1, filter.qs)
qs = filter.qs
self.assertEqual(len(qs), 1)
self.assertEqual(qs[0]["id"], self.doc1.id)

def test_institution_filter(self):
data = {"institution": [self.institution1.id]}
filter = DocumentFileFilter(data=data)
qs = filter.qs
self.assertEqual(len(qs), 1)
self.assertEqual(qs[0]["id"], self.doc1.id)

def test_document_type_filter(self):
data = {"document_type": ["glossary"]}
filter = DocumentFileFilter(data=data)
self.assertEqual(len(filter.qs), 2)
self.assertIn(self.doc1, filter.qs)
self.assertIn(self.doc2, filter.qs)
qs = filter.qs
self.assertEqual(len(qs), 2)
self.assertCountEqual([qs[0]["id"], qs[1]["id"]], [self.doc1.id, self.doc2.id])

def test_subjects_filter(self):
data = {"subjects": [self.subject1.id]}
filter = DocumentFileFilter(data=data)
self.assertEqual(len(filter.qs), 1)
self.assertIn(self.doc1, filter.qs)
qs = filter.qs
self.assertEqual(len(qs), 1)
self.assertEqual(qs[0]["id"], self.doc1.id)

def test_languages_filter(self):
data = {"languages": [self.language1.id]}
filter = DocumentFileFilter(data=data)
self.assertEqual(len(filter.qs), 1)
self.assertIn(self.doc1, filter.qs)
qs = filter.qs
self.assertEqual(len(qs), 1)
self.assertEqual(qs[0]["id"], self.doc1.id)

def test_combined_filters(self):
data = {
Expand All @@ -71,20 +89,36 @@ def test_combined_filters(self):
"languages": [self.language1.id],
}
filter = DocumentFileFilter(data=data)
self.assertEqual(len(filter.qs), 1)
self.assertIn(self.doc1, filter.qs)
qs = filter.qs
self.assertEqual(len(qs), 1)
self.assertEqual(qs[0]["id"], self.doc1.id)

def test_search_filter(self):
def test_search_filter_documents(self):
data = {"search": "Document"}
filter = DocumentFileFilter(data=data)
self.assertEqual(len(filter.qs), 2)
self.assertIn(self.doc1, filter.qs)
self.assertIn(self.doc2, filter.qs)
qs = filter.qs
self.assertEqual(len(qs), 2)
self.assertCountEqual([qs[0]["id"], qs[1]["id"]], [self.doc1.id, self.doc2.id])

data = {"search": "Document 1"}
filter = DocumentFileFilter(data=data)
self.assertEqual(len(filter.qs), 1)
self.assertIn(self.doc1, filter.qs)
qs = filter.qs
self.assertEqual(len(qs), 1)
self.assertEqual(qs[0]["id"], self.doc1.id)

def test_search_filter_projects(self):
data = {"search": "Project 1"}
filter = DocumentFileFilter(data=data)
qs = filter.qs
self.assertEqual(len(qs), 1)
self.assertEqual(qs[0]["id"], self.project1.id)

def test_search_filter_combined(self):
data = {"search": "1"}
filter = DocumentFileFilter(data=data)
qs = filter.qs
self.assertEqual(len(qs), 2)
self.assertCountEqual([qs[0]["id"], qs[1]["id"]], [self.doc1.id, self.project1.id])


if __name__ == "__main__":
Expand Down
2 changes: 1 addition & 1 deletion app/general/tests/test_view_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def test_search_filtering(self):
client = Client()
response = client.get(reverse("search"), {"search": "Document 1"})
self.assertEqual(response.status_code, 200)
self.assertEqual(response.context["documents"][0].title, "Document 1")
self.assertEqual(response.context["documents"][0]["heading"], "Document 1")

def test_invalid_page_number(self):
client = Client()
Expand Down
72 changes: 19 additions & 53 deletions app/templates/app/search.html
Original file line number Diff line number Diff line change
Expand Up @@ -18,57 +18,23 @@ <h5 class="card-title">{% trans "Search a term" %}</h5>
<br>
<br>
<div class="container">
{% for document in search_results %}
{% for result in search_results %}
<div>
<!--
<ul>
<li>
<span class="mr-5 text-primary"><b>{% trans "Title:" %}</b></span>
<span>{{ document.title }}</span>
</li>
<li class="mb-2">
<span class="mr-5"><b>{% trans "Institution:" %}</b></span>
<span>{{ document.institution }}</span>
</li>
<li class="mb-2">
<span class="mr-5"><b>{% trans "Headline:" %}</b></span>
<span>{{ document.search_headline|safe }}</span>
</li>
<li>
<span class="mr-5"><b>{% trans "License:" %}</b></span>
<span>{{ document.license }}</span>
</li>
<li>
<span class="mr-5"><b>{% trans "License:" %}</b></span>
<span>{{ document.document_type }}</span>
</li>
<li>
<span class="mr-5"><b>{% trans "Mime Type:" %}</b></span>
<span>{{ document.mime_type }}</span>
</li>
<li class="mt-2">
<span class="mr-5"><b>{% trans "Rank:" %}</b></span>
<span><i>{{ document.rank }}</i></span>
</li>
</ul>
-->
<h2><a href="{% url 'document_detail' document.id %}">{{ document.title }}</a></h2>
<span><a href="{% url 'institution_detail' document.institution.id %}">{{ document.institution }}</a></span>
<div>
<span class="mr-5"><b>{% trans "Excerpt:" %}</b></span>
<span>… {{ document.search_headline|safe }} …</span>
</div>
<div>
<span class="mr-5"><b>{% trans "License:" %}</b></span>
<span>{{ document.license }}</span>
</div>
<div>
<span>{{ document.languages.all |join:", " }}</span>
</div>
<div>
<span>{{ document.subjects.all |join:", " }}</span>
</div>
<br>
<h2><a href="{% url result.view result.id %}">{{ result.heading }}</a></h2>
<p>{{ result.description | truncatewords:10 }}</p>
<p>
<span class="mr-5"><b>{% trans "Excerpt:" %}</b></span>
<span>{{ result.search_headline|safe }}</span>
</p>
<p>
{% if result.logo_url != "no_logo" %}
<img src="/media/{{ result.logo_url }}" alt="{{ result.name }} logo"
class="project-logo">
{% endif %}
</p>
<p>
<span><i>{{ result.rank }}</i></span>
</p>
</div>
{% endfor %}
</div>
Expand All @@ -81,8 +47,8 @@ <h2><a href="{% url 'document_detail' document.id %}">{{ document.title }}</a></
<a href="{{ search_params }}&page={{ documents.previous_page_number }}">&lt; previous</a>
{% endif %}
<span class="current">
Page {{ documents.number }} of {{ documents.paginator.num_pages }}.
</span>
Page {{ documents.number }} of {{ documents.paginator.num_pages }}.
</span>
{% if documents.has_next %}
<a href="{{ search_params }}&page={{ documents.next_page_number }}">next &gt;</a>
<a href="{{ search_params }}&page={{ documents.paginator.num_pages }}">last
Expand Down Expand Up @@ -168,4 +134,4 @@ <h2><a href="{% url 'document_detail' document.id %}">{{ document.title }}</a></
</div>
</div>

{% endblock content %}
{% endblock content %}

0 comments on commit 6f558f6

Please sign in to comment.