Skip to content

Commit

Permalink
added projects to search filters
Browse files Browse the repository at this point in the history
updated tests

Combine search queries with union

Performance is inconsitent and often bad without limiting the span
of the search headline. It might need to be limited further. The
count query (caused by pagination) is often slow too. This should
be investigated to see if a simpler query could provide the count.

Most fields are removed from the template, as we're working towards
a simpler page anyway.

First stab at search weighting and normalization

Small cleanups

Store weights in generated documentfile.search_vector

This stores the correct weighted vector, and includes the document
description that was missing.

backup for code

fixed tests
  • Loading branch information
daniel-gray-tangent committed Jul 25, 2024
1 parent b248001 commit 7303de3
Show file tree
Hide file tree
Showing 10 changed files with 185 additions and 78 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,5 @@ app/media/
/logging/
/pdf_uploads/
/pdf_upload_completed/
/tmp/
/tmp/app_punt_env
1 change: 0 additions & 1 deletion app/general/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

self.fields["document_data"].widget = HiddenInput()
self.fields["search_vector"].widget = HiddenInput()

# If the instance has a mime_type, the field should be disabled
if not self.instance.mime_type:
Expand Down
74 changes: 57 additions & 17 deletions app/general/filters.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,16 @@
import django_filters
from django import forms
from django.contrib.postgres.search import SearchHeadline, SearchQuery, SearchRank
from django.db.models import F
from django.contrib.postgres.search import (
SearchHeadline,
SearchQuery,
SearchRank,
SearchVector,
)
from django.db.models import F, Value
from django.db.models.functions import Greatest, Left
from django_filters import ModelMultipleChoiceFilter, MultipleChoiceFilter

from general.models import DocumentFile, Institution, Language, Subject
from general.models import DocumentFile, Institution, Language, Project, Subject


class DocumentFileFilter(django_filters.FilterSet):
Expand Down Expand Up @@ -33,27 +39,61 @@ class Meta:
]

def filter_queryset(self, queryset):
# More information about weighting and normalization in postgres:
# https://www.postgresql.org/docs/current/textsearch-controls.html#TEXTSEARCH-RANKING
queryset = super().filter_queryset(queryset)

search = self.form.cleaned_data.get("search", "")
queue = SearchQuery(search.strip())
search_rank = SearchRank(F("search_vector"), queue)
search_headline = SearchHeadline("document_data", queue)
queryset = (
queryset.annotate(
rank=search_rank,
search_headline=search_headline,
search = self.form.cleaned_data.get("search", "").strip()
query = SearchQuery(search)

# A fixed list of identical fields are required to join queries of
# different classes with `.union()`:
fields = (
"id",
"heading",
"description",
"rank",
"search_headline",
"view",
"logo_url",
"logo_url",
)

# In the queries below, any differences between models must be fixed
# through e.g. `Value` or `F` annotations.
project_search_vector = SearchVector("name", weight="A") + SearchVector(
"description", weight="B"
)
project_query = (
Project.objects.annotate(
heading=F("name"),
view=Value("project_detail"),
logo_url=F("logo"),
search_headline=SearchHeadline("description", query),
rank=SearchRank(project_search_vector, query, normalization=16),
search=project_search_vector,
)
.defer("document_data")
.select_related("institution")
).order_by("-rank")
.filter(search=query)
.values(*fields)
)

return queryset
# We limit the headline to limit the performance impact. On very large
# documents, this slows things down if unconstrained.
search_headline = SearchHeadline(Left("document_data", 200_000), query)
search_rank = SearchRank(F("search_vector"), query, normalization=16)
queryset = queryset.annotate(
heading=F("title"),
view=Value("document_detail"),
logo_url=Value("no_logo"),
rank=search_rank,
search_headline=search_headline,
).values(*fields)
return queryset.union(project_query).order_by("-rank")

def filter_search(self, queryset, name, value):
if value:
queue = SearchQuery(value.strip())
return queryset.filter(search_vector=queue)
query = SearchQuery(value.strip())
return queryset.filter(search_vector=query)

else:
return queryset
27 changes: 27 additions & 0 deletions app/general/migrations/0012_alter_documentfile_search_vector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Generated by Django 5.0.6 on 2024-07-24 12:00

import django.contrib.postgres.search
from django.db import migrations, models


class Migration(migrations.Migration):

dependencies = [
('general', '0011_alter_documentfile_options_and_more'),
]

operations = [
migrations.RemoveField(
model_name='documentfile',
name='search_vector',
), # also deletes `search_vector_trigger` and general_doc_search__752b22_gin
migrations.AddField(
model_name='documentfile',
name='search_vector',
field=models.GeneratedField(db_persist=True, expression=django.contrib.postgres.search.CombinedSearchVector(django.contrib.postgres.search.CombinedSearchVector(django.contrib.postgres.search.SearchVector('title', config='english', weight='A'), '||', django.contrib.postgres.search.SearchVector('description', config='english', weight='B'), django.contrib.postgres.search.SearchConfig('english')), '||', django.contrib.postgres.search.SearchVector('document_data', config='english', weight='C'), django.contrib.postgres.search.SearchConfig('english')), null=True, output_field=django.contrib.postgres.search.SearchVectorField()),
),
migrations.AddIndex(
model_name='documentfile',
index=django.contrib.postgres.indexes.GinIndex(fields=['search_vector'], name='general_doc_search__752b22_gin'),
),
]
16 changes: 14 additions & 2 deletions app/general/models.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from django.contrib.postgres.indexes import GinIndex
from django.contrib.postgres.search import SearchVectorField
from django.contrib.postgres.search import SearchVector, SearchVectorField
from django.core.validators import FileExtensionValidator
from django.db import models
from django.utils.translation import gettext_lazy as _
Expand Down Expand Up @@ -142,7 +142,19 @@ class DocumentFile(models.Model):
subjects = models.ManyToManyField("Subject", blank=True, verbose_name=_("subjects"))
languages = models.ManyToManyField("Language", blank=True, verbose_name=_("languages"))

search_vector = SearchVectorField(null=True, blank=True)
# `config="english"` is required to ensure that the `expression` is
# immutable, otherwise the migration to the GeneratedField fails.
search_vector = models.GeneratedField(
expression=(
SearchVector("title", config="english", weight="A")
+ SearchVector("description", config="english", weight="B")
+ SearchVector("document_data", config="english", weight="C")
),
output_field=SearchVectorField(),
db_persist=True,
null=True,
blank=True,
)

# added simple historical records to the model
history = HistoricalRecords(excluded_fields=["document_data", "search_vector"])
Expand Down
70 changes: 52 additions & 18 deletions app/general/tests/test_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from django.test import TestCase

from general.filters import DocumentFileFilter
from general.models import DocumentFile, Institution, Language, Subject
from general.models import DocumentFile, Institution, Language, Project, Subject


class TestSearchFilter(TestCase):
Expand Down Expand Up @@ -38,30 +38,48 @@ def setUp(self):
self.doc2.subjects.add(self.subject2)
self.doc2.languages.add(self.language2)

# Create Projects for search testing
self.project1 = Project.objects.create(
name="Project 1",
description="Project 1 description",
institution=self.institution1,
logo="logo1.png",
)

def test_institution_filter(self):
data = {"institution": [self.institution1.id]}
filter = DocumentFileFilter(data=data)
self.assertEqual(len(filter.qs), 1)
self.assertIn(self.doc1, filter.qs)
qs = filter.qs
self.assertEqual(len(qs), 1)
self.assertEqual(qs[0]["id"], self.doc1.id)

def test_institution_filter(self):
data = {"institution": [self.institution1.id]}
filter = DocumentFileFilter(data=data)
qs = filter.qs
self.assertEqual(len(qs), 1)
self.assertEqual(qs[0]["id"], self.doc1.id)

def test_document_type_filter(self):
data = {"document_type": ["glossary"]}
filter = DocumentFileFilter(data=data)
self.assertEqual(len(filter.qs), 2)
self.assertIn(self.doc1, filter.qs)
self.assertIn(self.doc2, filter.qs)
qs = filter.qs
self.assertEqual(len(qs), 2)
self.assertCountEqual([qs[0]["id"], qs[1]["id"]], [self.doc1.id, self.doc2.id])

def test_subjects_filter(self):
data = {"subjects": [self.subject1.id]}
filter = DocumentFileFilter(data=data)
self.assertEqual(len(filter.qs), 1)
self.assertIn(self.doc1, filter.qs)
qs = filter.qs
self.assertEqual(len(qs), 1)
self.assertEqual(qs[0]["id"], self.doc1.id)

def test_languages_filter(self):
data = {"languages": [self.language1.id]}
filter = DocumentFileFilter(data=data)
self.assertEqual(len(filter.qs), 1)
self.assertIn(self.doc1, filter.qs)
qs = filter.qs
self.assertEqual(len(qs), 1)
self.assertEqual(qs[0]["id"], self.doc1.id)

def test_combined_filters(self):
data = {
Expand All @@ -71,20 +89,36 @@ def test_combined_filters(self):
"languages": [self.language1.id],
}
filter = DocumentFileFilter(data=data)
self.assertEqual(len(filter.qs), 1)
self.assertIn(self.doc1, filter.qs)
qs = filter.qs
self.assertEqual(len(qs), 1)
self.assertEqual(qs[0]["id"], self.doc1.id)

def test_search_filter(self):
def test_search_filter_documents(self):
data = {"search": "Document"}
filter = DocumentFileFilter(data=data)
self.assertEqual(len(filter.qs), 2)
self.assertIn(self.doc1, filter.qs)
self.assertIn(self.doc2, filter.qs)
qs = filter.qs
self.assertEqual(len(qs), 2)
self.assertCountEqual([qs[0]["id"], qs[1]["id"]], [self.doc1.id, self.doc2.id])

data = {"search": "Document 1"}
filter = DocumentFileFilter(data=data)
self.assertEqual(len(filter.qs), 1)
self.assertIn(self.doc1, filter.qs)
qs = filter.qs
self.assertEqual(len(qs), 1)
self.assertEqual(qs[0]["id"], self.doc1.id)

def test_search_filter_projects(self):
data = {"search": "Project 1"}
filter = DocumentFileFilter(data=data)
qs = filter.qs
self.assertEqual(len(qs), 1)
self.assertEqual(qs[0]["id"], self.project1.id)

def test_search_filter_combined(self):
data = {"search": "1"}
filter = DocumentFileFilter(data=data)
qs = filter.qs
self.assertEqual(len(qs), 2)
self.assertCountEqual([qs[0]["id"], qs[1]["id"]], [self.doc1.id, self.project1.id])


if __name__ == "__main__":
Expand Down
2 changes: 1 addition & 1 deletion app/general/tests/test_view_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def test_search_filtering(self):
client = Client()
response = client.get(reverse("search"), {"search": "Document 1"})
self.assertEqual(response.status_code, 200)
self.assertEqual(response.context["documents"][0].title, "Document 1")
self.assertEqual(response.context["documents"][0]["heading"], "Document 1")

def test_invalid_page_number(self):
client = Client()
Expand Down
55 changes: 18 additions & 37 deletions app/templates/app/search.html
Original file line number Diff line number Diff line change
Expand Up @@ -18,42 +18,23 @@ <h5 class="card-title">{% trans "Search a term" %}</h5>
<br>
<br>
<div class="container">
{% for document in search_results %}
{% for result in search_results %}
<div>
<ul>
<li>
<span class="mr-5 text-primary"><b>{% trans "Title:" %}</b></span>
<span>{{ document.title }}</span>
</li>
<li class="mb-2">
<span class="mr-5"><b>{% trans "Institution:" %}</b></span>
<span>{{ document.institution }}</span>
</li>
<li class="mb-2">
<span class="mr-5"><b>{% trans "Headline:" %}</b></span>
<span>{{ document.search_headline|safe }}</span>
</li>
<li>
<span class="mr-5"><b>{% trans "File:" %}</b></span>
<span>{{ document.uploaded_file }}</span>
</li>
<li>
<span class="mr-5"><b>{% trans "License:" %}</b></span>
<span>{{ document.license }}</span>
</li>
<li>
<span class="mr-5"><b>{% trans "License:" %}</b></span>
<span>{{ document.document_type }}</span>
</li>
<li>
<span class="mr-5"><b>{% trans "Mime Type:" %}</b></span>
<span>{{ document.mime_type }}</span>
</li>
<li class="mt-2">
<span class="mr-5"><b>{% trans "Rank:" %}</b></span>
<span><i>{{ document.rank }}</i></span>
</li>
</ul>
<h2><a href="{% url result.view result.id %}">{{ result.heading }}</a></h2>
<p>{{ result.description | truncatewords:10 }}</p>
<p>
<span class="mr-5"><b>{% trans "Excerpt:" %}</b></span>
<span>{{ result.search_headline|safe }}</span>
</p>
<p>
{% if result.logo_url != "no_logo" %}
<img src="/media/{{ result.logo_url }}" alt="{{ result.name }} logo"
class="project-logo">
{% endif %}
</p>
<p>
<span><i>{{ result.rank }}</i></span>
</p>
</div>
{% endfor %}
</div>
Expand All @@ -66,8 +47,8 @@ <h5 class="card-title">{% trans "Search a term" %}</h5>
<a href="{{ search_params }}&page={{ documents.previous_page_number }}">&lt; previous</a>
{% endif %}
<span class="current">
Page {{ documents.number }} of {{ documents.paginator.num_pages }}.
</span>
Page {{ documents.number }} of {{ documents.paginator.num_pages }}.
</span>
{% if documents.has_next %}
<a href="{{ search_params }}&page={{ documents.next_page_number }}">next &gt;</a>
<a href="{{ search_params }}&page={{ documents.paginator.num_pages }}">last
Expand Down
3 changes: 1 addition & 2 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
# This is is meant for local development, but should give an idea of what you
# This is meant for local development, but should give an idea of what you
# can consider in production.
version: '3.8'

services:
db:
Expand Down
13 changes: 13 additions & 0 deletions tmp/app_punt_env
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
SECRET_KEY='django-insecure-w!h85bp^$e8gm%c23r!0%9i7yzd=6w$s&ic+6!%306&kj8@k*5'

#DEBUG='True'

ALLOWED_HOSTS='localhost 127.0.0.1 admin'
USE_X_FORWARDED_HOST='True'
CSRF_TRUSTED_ORIGINS="http://localhost"

DB_HOST='terminology-db'
DB_PORT=5432
DB_NAME='term_db'
DB_USER='django'
DB_PASSWORD='bogom'

0 comments on commit 7303de3

Please sign in to comment.