Skip to content

Commit

Permalink
Merge pull request #434 from nationalarchives/feature/1412-highlight-…
Browse files Browse the repository at this point in the history
…query-on-results-page

Highlight query matches in document html
  • Loading branch information
timcowlishaw authored Nov 9, 2023
2 parents dfdda49 + 16f7bb1 commit f99030f
Show file tree
Hide file tree
Showing 8 changed files with 159 additions and 11 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,13 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog 1.0.0].

## [Unreleased]
- `document.content_as_html` now takes an optional `query=` string parameter, which, when supplied, highlights instances of the query within the document with `<mark>` tags, each of which has a numbered id indicating its sequence in the document.
- New `document.number_of_mentions` method, which takes a `query=` string parameter and returns the number of highlighted mentions in the HTML.

## [Release 17.1.0]
- New `Client.get_combined_stats_table` method to run a combined statistics query against MarkLogic.


## [Release 17.0.0]

- BREAKING: `VersionAnnotation` now requires a statement of if the action is automated or not
Expand Down
6 changes: 5 additions & 1 deletion src/caselawclient/Client.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,9 @@ def __init__(
self.session.headers.update({"User-Agent": user_agent})
self.user_agent = user_agent

def get_document_by_uri(self, uri: DocumentURIString) -> Document:
def get_document_by_uri(
    self, uri: DocumentURIString, query: Optional[str] = None
) -> Document:
    """
    Build the document object that corresponds to ``uri``.

    The concrete class is chosen by ``get_document_type_from_uri`` and is
    instantiated with the URI and this client.

    :param uri: URI of the document to look up.
    :param query: Optional query string. It is accepted by the signature but
        is not read anywhere in this method body.

    :return: An instance of the document class selected for ``uri``.
    """
    return self.get_document_type_from_uri(uri)(uri, self)

Expand Down Expand Up @@ -684,6 +686,7 @@ def eval_xslt(
version_uri: Optional[DocumentURIString] = None,
show_unpublished: bool = False,
xsl_filename: str = DEFAULT_XSL_TRANSFORM,
query: Optional[str] = None,
) -> requests.Response:
marklogic_document_uri = self._format_uri_for_marklogic(judgment_uri)
marklogic_document_version_uri = (
Expand All @@ -707,6 +710,7 @@ def eval_xslt(
"show_unpublished": show_unpublished,
"img_location": image_location,
"xsl_filename": xsl_filename,
"query": query,
}

return self._send_to_eval(vars, "xslt_transform.xqy")
Expand Down
42 changes: 36 additions & 6 deletions src/caselawclient/models/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,14 @@
from ds_caselaw_utils import courts
from ds_caselaw_utils.courts import CourtNotFoundException
from lxml import etree
from lxml import html as html_parser
from requests_toolbelt.multipart import decoder

from caselawclient.models.utilities import extract_version

from ..errors import (
DocumentNotFoundError,
GatewayTimeoutError,
NotSupportedOnVersion,
OnlySupportedOnVersion,
)
Expand All @@ -32,6 +34,10 @@ class UnparsableDate(Warning):
pass


class GatewayTimeoutGettingHTMLWithQuery(RuntimeWarning):
    """Warning category for a gateway timeout while fetching query-highlighted HTML."""


DOCUMENT_STATUS_HOLD = "On hold"
""" This document has been placed on hold to actively prevent publication. """

Expand Down Expand Up @@ -328,12 +334,36 @@ def content_as_xml_bytestring(self) -> bytes:
def content_as_xml_tree(self) -> Any:
return etree.fromstring(self.content_as_xml_bytestring)

def content_as_html(self, version_uri: Optional[DocumentURIString] = None) -> str:
results = self.api_client.eval_xslt(
self.uri, version_uri, show_unpublished=True
)
multipart_data = decoder.MultipartDecoder.from_response(results)
return str(multipart_data.parts[0].text)
def content_as_html(
    self,
    version_uri: Optional[DocumentURIString] = None,
    query: Optional[str] = None,
) -> str:
    """
    Return this document rendered as HTML via the client's ``eval_xslt``.

    :param version_uri: Optional URI of a specific version to render.
    :param query: Optional query string passed through to ``eval_xslt`` so
        that matches can be highlighted in the returned HTML.

    :return: The text of the first part of the multipart response.

    :raises GatewayTimeoutError: If the request times out and no ``query``
        was supplied (there is nothing cheaper to fall back to).

    If a gateway timeout occurs while a ``query`` is supplied, a
    ``GatewayTimeoutGettingHTMLWithQuery`` warning is emitted and the content
    is requested again without the query.
    """
    try:
        results = self.api_client.eval_xslt(
            self.uri, version_uri, show_unpublished=True, query=query
        )
        multipart_data = decoder.MultipartDecoder.from_response(results)
        return str(multipart_data.parts[0].text)
    except GatewayTimeoutError:
        if query is None:
            # Bare ``raise`` re-raises the active exception unchanged.
            raise

        # Adjacent string literals are concatenated verbatim, so each piece
        # needs its own trailing space to keep the words apart.
        warnings.warn(
            (
                "Gateway timeout when getting content with query "
                "highlighting for document %s, version %s, and query "
                '"%s", falling back to unhighlighted content...'
            )
            % (self.uri, version_uri, query),
            GatewayTimeoutGettingHTMLWithQuery,
        )
        return self.content_as_html(version_uri)

def number_of_mentions(self, query: str) -> int:
    """
    Count the ``<mark>`` elements in this document's HTML for ``query``.

    :param query: Query string passed to ``content_as_html`` so that matches
        are wrapped in ``<mark>`` tags.

    :return: Number of ``<mark>`` elements found beneath the root of the
        parsed HTML; ``0`` when the HTML is empty or whitespace-only.
    """
    html = self.content_as_html(query=query)

    # lxml refuses to parse an empty document (it raises ParserError), and an
    # empty document trivially contains no mentions.
    if not html.strip():
        return 0

    tree = html_parser.fromstring(html.encode("utf-8"))
    return len(tree.findall(".//mark"))

@cached_property
def is_failure(self) -> bool:
Expand Down
38 changes: 37 additions & 1 deletion src/caselawclient/xquery/xslt_transform.xqy
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
xquery version "1.0-ml";

import module namespace helper = "https://caselaw.nationalarchives.gov.uk/helper" at "/judgments/search/helper.xqy";

declare variable $show_unpublished as xs:boolean? external;
declare variable $uri as xs:string external;
declare variable $version_uri as xs:string? external;
declare variable $img_location as xs:string? external;
declare variable $xsl_filename as xs:string? external;
declare variable $query as xs:string? external;

let $judgment_published_property := xdmp:document-get-properties($uri, xs:QName("published"))[1]
let $is_published := $judgment_published_property/text()
Expand All @@ -14,6 +17,27 @@ let $xsl_path := fn:concat("judgments/xslts/", $xsl_filename)

let $params := map:map()

let $number_marks_xslt := (
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
version="2.0">
<xsl:output method="html" />
<xsl:template match="@*|node()">
<xsl:copy>
<xsl:apply-templates select="@*|node()"/>
</xsl:copy>
</xsl:template>
<xsl:template match="mark">
<xsl:copy>
<xsl:copy-of select="@*" />
<xsl:attribute name="id">
<xsl:text>mark_</xsl:text>
<xsl:value-of select="count(preceding::mark)"/>
</xsl:attribute>
<xsl:apply-templates />
</xsl:copy>
</xsl:template>
</xsl:stylesheet>
)
(: change the image-base of the document to match the location of the assets in $image_base
so that references to images point to the correct places on the internet :)
let $_put := map:put(
Expand All @@ -26,12 +50,24 @@ let $_ := if (not(exists($document_to_transform))) then
fn:error(xs:QName("FCL_DOCUMENTNOTFOUND"), "No XML document was found to transform")
) else ()

let $return_value := if (xs:boolean($is_published) or $show_unpublished) then
let $retrieved_value := if (xs:boolean($is_published) or $show_unpublished) then
xdmp:xslt-invoke($xsl_path,
$document_to_transform,
$params
)/element()
else
()

let $return_value := if($query) then
xdmp:xslt-eval(
$number_marks_xslt,
cts:highlight(
$retrieved_value,
helper:make-q-query($query),
<mark>{$cts:text}</mark>
)
)
else
$retrieved_value

return $return_value
1 change: 1 addition & 0 deletions src/caselawclient/xquery_type_dicts.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,7 @@ class XsltDict(MarkLogicAPIDict):
# xslt_transform.xqy
class XsltTransformDict(MarkLogicAPIDict):
img_location: Optional[str]
query: Optional[str]
show_unpublished: Optional[bool]
uri: MarkLogicDocumentURIString
version_uri: Optional[MarkLogicDocumentVersionURIString]
Expand Down
39 changes: 36 additions & 3 deletions tests/client/test_eval_xslt.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from unittest.mock import patch

from caselawclient.Client import ROOT_DIR, MarklogicApiClient
from caselawclient.xquery_type_dicts import XsltTransformDict


class TestEvalXslt(unittest.TestCase):
Expand All @@ -18,12 +19,13 @@ def test_eval_xslt_user_can_view_unpublished(self):
self.client, "user_can_view_unpublished_judgments", return_value=True
):
uri = "/judgment/uri"
expected_vars = {
expected_vars: XsltTransformDict = {
"uri": "/judgment/uri.xml",
"version_uri": None,
"show_unpublished": True,
"img_location": "imagepath",
"xsl_filename": "accessible-html.xsl",
"query": None,
}
self.client.eval_xslt(uri, show_unpublished=True)

Expand All @@ -44,12 +46,13 @@ def test_eval_xslt_user_cannot_view_unpublished(self):
):
with patch.object(logging, "warning") as mock_logging:
uri = "/judgment/uri"
expected_vars = {
expected_vars: XsltTransformDict = {
"uri": "/judgment/uri.xml",
"version_uri": None,
"show_unpublished": False,
"img_location": "imagepath",
"xsl_filename": "accessible-html.xsl",
"query": None,
}
self.client.eval_xslt(uri, show_unpublished=True)

Expand All @@ -68,12 +71,13 @@ def test_eval_xslt_with_filename(self):
self.client, "user_can_view_unpublished_judgments", return_value=True
):
uri = "/judgment/uri"
expected_vars = {
expected_vars: XsltTransformDict = {
"uri": "/judgment/uri.xml",
"version_uri": None,
"show_unpublished": True,
"img_location": "imagepath",
"xsl_filename": "as-handed-down.xsl",
"query": None,
}
self.client.eval_xslt(
uri, show_unpublished=True, xsl_filename="as-handed-down.xsl"
Expand All @@ -83,3 +87,32 @@ def test_eval_xslt_with_filename(self):
os.path.join(ROOT_DIR, "xquery", "xslt_transform.xqy")
)
assert mock_eval.call_args.kwargs["vars"] == json.dumps(expected_vars)

@patch.dict(os.environ, {"XSLT_IMAGE_LOCATION": "imagepath"}, clear=True)
def test_eval_xslt_with_query(self):
    """A supplied query string is forwarded in the vars sent to the XQuery."""
    search_query = "the query string"
    expected_vars: XsltTransformDict = {
        "uri": "/judgment/uri.xml",
        "version_uri": None,
        "show_unpublished": True,
        "img_location": "imagepath",
        "xsl_filename": "as-handed-down.xsl",
        "query": search_query,
    }

    with patch.object(self.client, "eval") as mock_eval, patch.object(
        self.client, "user_can_view_unpublished_judgments", return_value=True
    ):
        self.client.eval_xslt(
            "/judgment/uri",
            show_unpublished=True,
            xsl_filename="as-handed-down.xsl",
            query=search_query,
        )

    # The mock keeps its recorded call after the patch context exits.
    expected_script = os.path.join(ROOT_DIR, "xquery", "xslt_transform.xqy")
    assert mock_eval.call_args.args[0] == expected_script
    assert mock_eval.call_args.kwargs["vars"] == json.dumps(expected_vars)
33 changes: 33 additions & 0 deletions tests/models/test_documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
DocumentNotSafeForDeletion,
UnparsableDate,
)
from tests.test_helpers import MockMultipartResponse


@pytest.fixture
Expand Down Expand Up @@ -210,6 +211,38 @@ def test_document_version_number_when_is_version(self, mock_api_client):
assert version_document.version_number == 9
assert version_document.is_version

def test_number_of_mentions_when_no_mentions(self, mock_api_client):
    """HTML without any <mark> elements yields a mention count of zero."""
    unmarked_html = """
<article>
<p>An article with no mark elements.</p>
</article>
"""
    mock_api_client.eval_xslt.return_value = MockMultipartResponse(
        unmarked_html.encode("utf-8")
    )

    mention_count = Document("test/1234", mock_api_client).number_of_mentions("some")

    assert mention_count == 0

def test_number_of_mentions_when_mentions(self, mock_api_client):
    """HTML containing two <mark> elements yields a mention count of two."""
    marked_html = """
<article>
<p>
An article with <mark id="mark_0">some</mark> mark elements, and <mark id="mark_1">some</mark> more.
</p>
</article>
"""
    mock_api_client.eval_xslt.return_value = MockMultipartResponse(
        marked_html.encode("utf-8")
    )

    mention_count = Document("test/1234", mock_api_client).number_of_mentions("some")

    assert mention_count == 2


class TestDocumentValidation:
def test_judgment_is_failure(self, mock_api_client):
Expand Down
8 changes: 8 additions & 0 deletions tests/test_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from requests_toolbelt.multipart import encoder


class MockMultipartResponse:
    """
    Minimal stand-in for a response object that carries a multipart body.

    The payload passed in is encoded as a single multipart field named
    ``content``; the encoded body is exposed as ``content`` and the matching
    ``content-type`` header as ``headers``.
    """

    def __init__(self, text):
        encoded = encoder.MultipartEncoder({"content": text})
        self.headers = {"content-type": encoded.content_type}
        self.content = encoded.to_string()

0 comments on commit f99030f

Please sign in to comment.