Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/search weight2 #101

Merged
merged 8 commits into from
Jul 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion app/general/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

self.fields["document_data"].widget = HiddenInput()
self.fields["search_vector"].widget = HiddenInput()

# If the instance has a mime_type, the field should be disabled
if not self.instance.mime_type:
Expand Down
75 changes: 58 additions & 17 deletions app/general/filters.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,16 @@
import django_filters
from django import forms
from django.contrib.postgres.search import SearchHeadline, SearchQuery, SearchRank
from django.db.models import F
from django.contrib.postgres.search import (
SearchHeadline,
SearchQuery,
SearchRank,
SearchVector,
)
from django.db.models import F, Value
from django.db.models.functions import Greatest, Left
from django_filters import ModelMultipleChoiceFilter, MultipleChoiceFilter

from general.models import DocumentFile, Institution, Language, Subject
from general.models import DocumentFile, Institution, Language, Project, Subject


class DocumentFileFilter(django_filters.FilterSet):
Expand Down Expand Up @@ -33,27 +39,62 @@ class Meta:
]

def filter_queryset(self, queryset):
# More information about weighting and normalization in postgres:
# https://www.postgresql.org/docs/current/textsearch-controls.html#TEXTSEARCH-RANKING
queryset = super().filter_queryset(queryset)

search = self.form.cleaned_data.get("search", "")
queue = SearchQuery(search.strip())
search_rank = SearchRank(F("search_vector"), queue)
search_headline = SearchHeadline("document_data", queue)
queryset = (
queryset.annotate(
rank=search_rank,
search_headline=search_headline,
search = self.form.cleaned_data.get("search", "").strip()
query = SearchQuery(search)

# A fixed list of identical fields are required to join queries of
# different classes with `.union()`:
fields = (
"id",
"heading",
"description",
"rank",
"search_headline",
"view",
"logo_url",
)

# In the queries below, any differences between models must be fixed
# through e.g. `Value` or `F` annotations.
project_search_vector = SearchVector("name", weight="A") + SearchVector(
"description", weight="B"
)
project_query = (
Project.objects.annotate(
heading=F("name"),
view=Value("project_detail"),
logo_url=F("logo"),
search_headline=SearchHeadline("description", query, max_words=15, min_words=10),
rank=SearchRank(project_search_vector, query, normalization=16),
search=project_search_vector,
)
.defer("document_data")
.select_related("institution")
).order_by("-rank")
.filter(search=query)
.values(*fields)
)

return queryset
# We limit the headline to limit the performance impact. On very large
# documents, this slows things down if unconstrained.
search_headline = SearchHeadline(
Left("document_data", 20_000), query, max_words=15, min_words=10
)
search_rank = SearchRank(F("search_vector"), query, normalization=16)
queryset = queryset.annotate(
heading=F("title"),
view=Value("document_detail"),
logo_url=Value(""),
rank=search_rank,
search_headline=search_headline,
).values(*fields)
return queryset.union(project_query, all=True).order_by("-rank")

def filter_search(self, queryset, name, value):
if value:
queue = SearchQuery(value.strip())
return queryset.filter(search_vector=queue)
query = SearchQuery(value.strip())
return queryset.filter(search_vector=query)

else:
return queryset
32 changes: 32 additions & 0 deletions app/general/migrations/0012_alter_documentfile_search_vector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import django.contrib.postgres.search
from django.db import migrations, models


class Migration(migrations.Migration):
dependencies = [
('general', '0011_alter_documentfile_options_and_more'),
]

operations = [
migrations.RemoveField(
model_name='documentfile',
name='search_vector',
), # also deletes `search_vector_trigger` and general_doc_search__752b22_gin
migrations.AddField(
model_name='documentfile',
name='search_vector',
field=models.GeneratedField(db_persist=True, expression=django.contrib.postgres.search.CombinedSearchVector(
django.contrib.postgres.search.CombinedSearchVector(
django.contrib.postgres.search.SearchVector('title', config='english', weight='A'), '||',
django.contrib.postgres.search.SearchVector('description', config='english', weight='B'),
django.contrib.postgres.search.SearchConfig('english')), '||',
django.contrib.postgres.search.SearchVector('document_data', config='english', weight='C'),
django.contrib.postgres.search.SearchConfig('english')), null=True,
output_field=django.contrib.postgres.search.SearchVectorField()),
),
migrations.AddIndex(
model_name='documentfile',
index=django.contrib.postgres.indexes.GinIndex(fields=['search_vector'],
name='general_doc_search__752b22_gin'),
),
]
16 changes: 14 additions & 2 deletions app/general/models.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from django.contrib.postgres.indexes import GinIndex
from django.contrib.postgres.search import SearchVectorField
from django.contrib.postgres.search import SearchVector, SearchVectorField
from django.core.validators import FileExtensionValidator
from django.db import models
from django.utils.translation import gettext_lazy as _
Expand Down Expand Up @@ -142,7 +142,19 @@ class DocumentFile(models.Model):
subjects = models.ManyToManyField("Subject", blank=True, verbose_name=_("subjects"))
languages = models.ManyToManyField("Language", blank=True, verbose_name=_("languages"))

search_vector = SearchVectorField(null=True, blank=True)
# `config="english"` is required to ensure that the `expression` is
# immutable, otherwise the migration to the GeneratedField fails.
search_vector = models.GeneratedField(
expression=(
SearchVector("title", config="english", weight="A")
+ SearchVector("description", config="english", weight="B")
+ SearchVector("document_data", config="english", weight="C")
),
output_field=SearchVectorField(),
db_persist=True,
null=True,
blank=True,
)

# added simple historical records to the model
history = HistoricalRecords(excluded_fields=["document_data", "search_vector"])
Expand Down
70 changes: 52 additions & 18 deletions app/general/tests/test_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from django.test import TestCase

from general.filters import DocumentFileFilter
from general.models import DocumentFile, Institution, Language, Subject
from general.models import DocumentFile, Institution, Language, Project, Subject


class TestSearchFilter(TestCase):
Expand Down Expand Up @@ -38,30 +38,48 @@ def setUp(self):
self.doc2.subjects.add(self.subject2)
self.doc2.languages.add(self.language2)

# Create Projects for search testing
self.project1 = Project.objects.create(
name="Project 1",
description="Project 1 description",
institution=self.institution1,
logo="logo1.png",
)

def test_institution_filter(self):
data = {"institution": [self.institution1.id]}
filter = DocumentFileFilter(data=data)
self.assertEqual(len(filter.qs), 1)
self.assertIn(self.doc1, filter.qs)
qs = filter.qs
self.assertEqual(len(qs), 1)
self.assertEqual(qs[0]["id"], self.doc1.id)

def test_institution_filter(self):
data = {"institution": [self.institution1.id]}
filter = DocumentFileFilter(data=data)
qs = filter.qs
self.assertEqual(len(qs), 1)
self.assertEqual(qs[0]["id"], self.doc1.id)

def test_document_type_filter(self):
data = {"document_type": ["glossary"]}
filter = DocumentFileFilter(data=data)
self.assertEqual(len(filter.qs), 2)
self.assertIn(self.doc1, filter.qs)
self.assertIn(self.doc2, filter.qs)
qs = filter.qs
self.assertEqual(len(qs), 2)
self.assertCountEqual([qs[0]["id"], qs[1]["id"]], [self.doc1.id, self.doc2.id])

def test_subjects_filter(self):
data = {"subjects": [self.subject1.id]}
filter = DocumentFileFilter(data=data)
self.assertEqual(len(filter.qs), 1)
self.assertIn(self.doc1, filter.qs)
qs = filter.qs
self.assertEqual(len(qs), 1)
self.assertEqual(qs[0]["id"], self.doc1.id)

def test_languages_filter(self):
data = {"languages": [self.language1.id]}
filter = DocumentFileFilter(data=data)
self.assertEqual(len(filter.qs), 1)
self.assertIn(self.doc1, filter.qs)
qs = filter.qs
self.assertEqual(len(qs), 1)
self.assertEqual(qs[0]["id"], self.doc1.id)

def test_combined_filters(self):
data = {
Expand All @@ -71,20 +89,36 @@ def test_combined_filters(self):
"languages": [self.language1.id],
}
filter = DocumentFileFilter(data=data)
self.assertEqual(len(filter.qs), 1)
self.assertIn(self.doc1, filter.qs)
qs = filter.qs
self.assertEqual(len(qs), 1)
self.assertEqual(qs[0]["id"], self.doc1.id)

def test_search_filter(self):
def test_search_filter_documents(self):
data = {"search": "Document"}
filter = DocumentFileFilter(data=data)
self.assertEqual(len(filter.qs), 2)
self.assertIn(self.doc1, filter.qs)
self.assertIn(self.doc2, filter.qs)
qs = filter.qs
self.assertEqual(len(qs), 2)
self.assertCountEqual([qs[0]["id"], qs[1]["id"]], [self.doc1.id, self.doc2.id])

data = {"search": "Document 1"}
filter = DocumentFileFilter(data=data)
self.assertEqual(len(filter.qs), 1)
self.assertIn(self.doc1, filter.qs)
qs = filter.qs
self.assertEqual(len(qs), 1)
self.assertEqual(qs[0]["id"], self.doc1.id)

def test_search_filter_projects(self):
data = {"search": "Project 1"}
filter = DocumentFileFilter(data=data)
qs = filter.qs
self.assertEqual(len(qs), 1)
self.assertEqual(qs[0]["id"], self.project1.id)

def test_search_filter_combined(self):
data = {"search": "1"}
filter = DocumentFileFilter(data=data)
qs = filter.qs
self.assertEqual(len(qs), 2)
self.assertCountEqual([qs[0]["id"], qs[1]["id"]], [self.doc1.id, self.project1.id])


if __name__ == "__main__":
Expand Down
2 changes: 1 addition & 1 deletion app/general/tests/test_view_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def test_search_filtering(self):
client = Client()
response = client.get(reverse("search"), {"search": "Document 1"})
self.assertEqual(response.status_code, 200)
self.assertEqual(response.context["documents"][0].title, "Document 1")
self.assertEqual(response.context["documents"][0]["heading"], "Document 1")

def test_invalid_page_number(self):
client = Client()
Expand Down
70 changes: 18 additions & 52 deletions app/templates/app/search.html
Original file line number Diff line number Diff line change
Expand Up @@ -18,57 +18,23 @@ <h5 class="card-title">{% trans "Search a term" %}</h5>
<br>
<br>
<div class="container">
{% for document in search_results %}
{% for result in search_results %}
<div>
<!--
<ul>
<li>
<span class="mr-5 text-primary"><b>{% trans "Title:" %}</b></span>
<span>{{ document.title }}</span>
</li>
<li class="mb-2">
<span class="mr-5"><b>{% trans "Institution:" %}</b></span>
<span>{{ document.institution }}</span>
</li>
<li class="mb-2">
<span class="mr-5"><b>{% trans "Headline:" %}</b></span>
<span>{{ document.search_headline|safe }}</span>
</li>
<li>
<span class="mr-5"><b>{% trans "License:" %}</b></span>
<span>{{ document.license }}</span>
</li>
<li>
<span class="mr-5"><b>{% trans "License:" %}</b></span>
<span>{{ document.document_type }}</span>
</li>
<li>
<span class="mr-5"><b>{% trans "Mime Type:" %}</b></span>
<span>{{ document.mime_type }}</span>
</li>
<li class="mt-2">
<span class="mr-5"><b>{% trans "Rank:" %}</b></span>
<span><i>{{ document.rank }}</i></span>
</li>
</ul>
-->
<h2><a href="{% url 'document_detail' document.id %}">{{ document.title }}</a></h2>
<span><a href="{% url 'institution_detail' document.institution.id %}">{{ document.institution }}</a></span>
<div>
<span class="mr-5"><b>{% trans "Excerpt:" %}</b></span>
<span>… {{ document.search_headline|safe }} …</span>
</div>
<div>
<span class="mr-5"><b>{% trans "License:" %}</b></span>
<span>{{ document.license }}</span>
</div>
<div>
<span>{{ document.languages.all |join:", " }}</span>
</div>
<div>
<span>{{ document.subjects.all |join:", " }}</span>
</div>
<br>
<h2><a href="{% url result.view result.id %}">{{ result.heading }}</a></h2>
<p>{{ result.description | truncatewords:10 }}</p>
<p>
<span class="mr-5"><b>{% trans "Excerpt:" %}</b></span>
<span>{{ result.search_headline|safe }}</span>
</p>
<p>
{% if result.logo_url %}
<img src="/media/{{ result.logo_url }}" alt="{{ result.name }} logo"
class="project-logo">
{% endif %}
</p>
<p>
<span><i>{{ result.rank }}</i></span>
</p>
</div>
{% endfor %}
</div>
Expand All @@ -81,8 +47,8 @@ <h2><a href="{% url 'document_detail' document.id %}">{{ document.title }}</a></
<a href="{{ search_params }}&page={{ documents.previous_page_number }}">&lt; previous</a>
{% endif %}
<span class="current">
Page {{ documents.number }} of {{ documents.paginator.num_pages }}.
</span>
Page {{ documents.number }} of {{ documents.paginator.num_pages }}.
</span>
{% if documents.has_next %}
<a href="{{ search_params }}&page={{ documents.next_page_number }}">next &gt;</a>
<a href="{{ search_params }}&page={{ documents.paginator.num_pages }}">last
Expand Down
Loading