From 16f7bb1d3e5ce7ea717cf12a1d057b4959f20083 Mon Sep 17 00:00:00 2001 From: Tim Cowlishaw Date: Tue, 24 Oct 2023 10:17:50 +0200 Subject: [PATCH] Highlight search term in judgment `document.content_as_html` now takes an optional `query` parameter which is preprocessed in the same manner as the search query, then used to highlight matching terms in the document, which are returned with a surrounding `<mark>` tag. This mark tag has an id which gives the position of the match in sequence, from 0..n, in the form `mark_{x}`. A new `document.number_of_mentions` method takes a `query` parameter and returns the number of highlighted mentions returned in the html for the query --- CHANGELOG.md | 3 ++ src/caselawclient/Client.py | 6 ++- src/caselawclient/models/documents.py | 42 ++++++++++++++++++--- src/caselawclient/xquery/xslt_transform.xqy | 38 ++++++++++++++++++- src/caselawclient/xquery_type_dicts.py | 1 + tests/client/test_eval_xslt.py | 39 +++++++++++++++++-- tests/models/test_documents.py | 33 ++++++++++++++++ tests/test_helpers.py | 8 ++++ 8 files changed, 159 insertions(+), 11 deletions(-) create mode 100644 tests/test_helpers.py diff --git a/CHANGELOG.md b/CHANGELOG.md index a363b5be..7b41d61a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,10 +5,13 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog 1.0.0]. ## [Unreleased] +- `document.content_as_html` now takes an optional `query=` string parameter, which, when supplied, highlights instances of the query within the document with `<mark>` tags, each of which has a numbered id indicating its sequence in the document. +- New `document.number_of_mentions` method which takes a `query=` string parameter, and returns the number of highlighted mentions in the html. ## [Release 17.1.0] - New `Client.get_combined_stats_table` method to run a combined statistics query against MarkLogic. 
+ ## [Release 17.0.0] - BREAKING: `VersionAnnotation` now requires a statement of if the action is automated or not diff --git a/src/caselawclient/Client.py b/src/caselawclient/Client.py index f511aec6..a1347379 100644 --- a/src/caselawclient/Client.py +++ b/src/caselawclient/Client.py @@ -205,7 +205,9 @@ def __init__( self.session.headers.update({"User-Agent": user_agent}) self.user_agent = user_agent - def get_document_by_uri(self, uri: DocumentURIString) -> Document: + def get_document_by_uri( + self, uri: DocumentURIString, query: Optional[str] = None + ) -> Document: document_type_class = self.get_document_type_from_uri(uri) return document_type_class(uri, self) @@ -684,6 +686,7 @@ def eval_xslt( version_uri: Optional[DocumentURIString] = None, show_unpublished: bool = False, xsl_filename: str = DEFAULT_XSL_TRANSFORM, + query: Optional[str] = None, ) -> requests.Response: marklogic_document_uri = self._format_uri_for_marklogic(judgment_uri) marklogic_document_version_uri = ( @@ -707,6 +710,7 @@ def eval_xslt( "show_unpublished": show_unpublished, "img_location": image_location, "xsl_filename": xsl_filename, + "query": query, } return self._send_to_eval(vars, "xslt_transform.xqy") diff --git a/src/caselawclient/models/documents.py b/src/caselawclient/models/documents.py index dfe6ec7d..0a5786a1 100644 --- a/src/caselawclient/models/documents.py +++ b/src/caselawclient/models/documents.py @@ -6,12 +6,14 @@ from ds_caselaw_utils import courts from ds_caselaw_utils.courts import CourtNotFoundException from lxml import etree +from lxml import html as html_parser from requests_toolbelt.multipart import decoder from caselawclient.models.utilities import extract_version from ..errors import ( DocumentNotFoundError, + GatewayTimeoutError, NotSupportedOnVersion, OnlySupportedOnVersion, ) @@ -32,6 +34,10 @@ class UnparsableDate(Warning): pass +class GatewayTimeoutGettingHTMLWithQuery(RuntimeWarning): + pass + + DOCUMENT_STATUS_HOLD = "On hold" """ This document has been 
placed on hold to actively prevent publication. """ @@ -328,12 +334,36 @@ def content_as_xml_bytestring(self) -> bytes: def content_as_xml_tree(self) -> Any: return etree.fromstring(self.content_as_xml_bytestring) - def content_as_html(self, version_uri: Optional[DocumentURIString] = None) -> str: - results = self.api_client.eval_xslt( - self.uri, version_uri, show_unpublished=True - ) - multipart_data = decoder.MultipartDecoder.from_response(results) - return str(multipart_data.parts[0].text) + def content_as_html( + self, + version_uri: Optional[DocumentURIString] = None, + query: Optional[str] = None, + ) -> str: + try: + results = self.api_client.eval_xslt( + self.uri, version_uri, show_unpublished=True, query=query + ) + multipart_data = decoder.MultipartDecoder.from_response(results) + return str(multipart_data.parts[0].text) + except GatewayTimeoutError as e: + if query is not None: + warnings.warn( + ( + "Gateway timeout when getting content with query" + "highlighting for document %s, version %s, and query" + '"%s", falling back to unhighlighted content...' + ) + % (self.uri, version_uri, query), + GatewayTimeoutGettingHTMLWithQuery, + ) + return self.content_as_html(version_uri) + else: + raise e + + def number_of_mentions(self, query: str) -> int: + html = self.content_as_html(query=query) + tree = html_parser.fromstring(html.encode("utf-8")) + return len(tree.findall(".//mark")) @cached_property def is_failure(self) -> bool: diff --git a/src/caselawclient/xquery/xslt_transform.xqy b/src/caselawclient/xquery/xslt_transform.xqy index a0908d3f..687707dd 100644 --- a/src/caselawclient/xquery/xslt_transform.xqy +++ b/src/caselawclient/xquery/xslt_transform.xqy @@ -1,10 +1,13 @@ xquery version "1.0-ml"; +import module namespace helper = "https://caselaw.nationalarchives.gov.uk/helper" at "/judgments/search/helper.xqy"; + declare variable $show_unpublished as xs:boolean? 
external; declare variable $uri as xs:string external; declare variable $version_uri as xs:string? external; declare variable $img_location as xs:string? external; declare variable $xsl_filename as xs:string? external; +declare variable $query as xs:string? external; let $judgment_published_property := xdmp:document-get-properties($uri, xs:QName("published"))[1] let $is_published := $judgment_published_property/text() @@ -14,6 +17,27 @@ let $xsl_path := fn:concat("judgments/xslts/", $xsl_filename) let $params := map:map() +let $number_marks_xslt := ( + + + + + + + + + + + + mark_ + + + + + + +) (: change the image-base of the document to match the location of the assets in $image_base so that references to images point to the correct places on the internet :) let $_put := map:put( @@ -26,7 +50,7 @@ let $_ := if (not(exists($document_to_transform))) then fn:error(xs:QName("FCL_DOCUMENTNOTFOUND"), "No XML document was found to transform") ) else () -let $return_value := if (xs:boolean($is_published) or $show_unpublished) then +let $retrieved_value := if (xs:boolean($is_published) or $show_unpublished) then xdmp:xslt-invoke($xsl_path, $document_to_transform, $params @@ -34,4 +58,16 @@ let $return_value := if (xs:boolean($is_published) or $show_unpublished) then else () +let $return_value := if($query) then + xdmp:xslt-eval( + $number_marks_xslt, + cts:highlight( + $retrieved_value, + helper:make-q-query($query), + {$cts:text} + ) + ) + else + $retrieved_value + return $return_value diff --git a/src/caselawclient/xquery_type_dicts.py b/src/caselawclient/xquery_type_dicts.py index 8fea3b45..10547480 100644 --- a/src/caselawclient/xquery_type_dicts.py +++ b/src/caselawclient/xquery_type_dicts.py @@ -192,6 +192,7 @@ class XsltDict(MarkLogicAPIDict): # xslt_transform.xqy class XsltTransformDict(MarkLogicAPIDict): img_location: Optional[str] + query: Optional[str] show_unpublished: Optional[bool] uri: MarkLogicDocumentURIString version_uri: 
Optional[MarkLogicDocumentVersionURIString] diff --git a/tests/client/test_eval_xslt.py b/tests/client/test_eval_xslt.py index 87972db6..5984d5e8 100644 --- a/tests/client/test_eval_xslt.py +++ b/tests/client/test_eval_xslt.py @@ -5,6 +5,7 @@ from unittest.mock import patch from caselawclient.Client import ROOT_DIR, MarklogicApiClient +from caselawclient.xquery_type_dicts import XsltTransformDict class TestEvalXslt(unittest.TestCase): @@ -18,12 +19,13 @@ def test_eval_xslt_user_can_view_unpublished(self): self.client, "user_can_view_unpublished_judgments", return_value=True ): uri = "/judgment/uri" - expected_vars = { + expected_vars: XsltTransformDict = { "uri": "/judgment/uri.xml", "version_uri": None, "show_unpublished": True, "img_location": "imagepath", "xsl_filename": "accessible-html.xsl", + "query": None, } self.client.eval_xslt(uri, show_unpublished=True) @@ -44,12 +46,13 @@ def test_eval_xslt_user_cannot_view_unpublished(self): ): with patch.object(logging, "warning") as mock_logging: uri = "/judgment/uri" - expected_vars = { + expected_vars: XsltTransformDict = { "uri": "/judgment/uri.xml", "version_uri": None, "show_unpublished": False, "img_location": "imagepath", "xsl_filename": "accessible-html.xsl", + "query": None, } self.client.eval_xslt(uri, show_unpublished=True) @@ -68,12 +71,13 @@ def test_eval_xslt_with_filename(self): self.client, "user_can_view_unpublished_judgments", return_value=True ): uri = "/judgment/uri" - expected_vars = { + expected_vars: XsltTransformDict = { "uri": "/judgment/uri.xml", "version_uri": None, "show_unpublished": True, "img_location": "imagepath", "xsl_filename": "as-handed-down.xsl", + "query": None, } self.client.eval_xslt( uri, show_unpublished=True, xsl_filename="as-handed-down.xsl" @@ -83,3 +87,32 @@ def test_eval_xslt_with_filename(self): os.path.join(ROOT_DIR, "xquery", "xslt_transform.xqy") ) assert mock_eval.call_args.kwargs["vars"] == json.dumps(expected_vars) + + @patch.dict(os.environ, 
{"XSLT_IMAGE_LOCATION": "imagepath"}, clear=True) + def test_eval_xslt_with_query(self): + with patch.object(self.client, "eval") as mock_eval: + with patch.object( + self.client, "user_can_view_unpublished_judgments", return_value=True + ): + uri = "/judgment/uri" + query = "the query string" + expected_vars: XsltTransformDict = { + "uri": "/judgment/uri.xml", + "version_uri": None, + "show_unpublished": True, + "img_location": "imagepath", + "xsl_filename": "as-handed-down.xsl", + "query": query, + } + self.client.eval_xslt( + uri, + show_unpublished=True, + xsl_filename="as-handed-down.xsl", + query=query, + ) + + assert mock_eval.call_args.args[0] == ( + os.path.join(ROOT_DIR, "xquery", "xslt_transform.xqy") + ) + + assert mock_eval.call_args.kwargs["vars"] == json.dumps(expected_vars) diff --git a/tests/models/test_documents.py b/tests/models/test_documents.py index d113b1f8..0e554f72 100644 --- a/tests/models/test_documents.py +++ b/tests/models/test_documents.py @@ -20,6 +20,7 @@ DocumentNotSafeForDeletion, UnparsableDate, ) +from tests.test_helpers import MockMultipartResponse @pytest.fixture @@ -210,6 +211,38 @@ def test_document_version_number_when_is_version(self, mock_api_client): assert version_document.version_number == 9 assert version_document.is_version + def test_number_of_mentions_when_no_mentions(self, mock_api_client): + mock_api_client.eval_xslt.return_value = MockMultipartResponse( + """ +
+

An article with no mark elements.

+
+ """.encode( + "utf-8" + ) + ) + + document = Document("test/1234", mock_api_client) + + assert document.number_of_mentions("some") == 0 + + def test_number_of_mentions_when_mentions(self, mock_api_client): + mock_api_client.eval_xslt.return_value = MockMultipartResponse( + """ +
+

+ An article with some mark elements, and some more. +

+
+ """.encode( + "utf-8" + ) + ) + + document = Document("test/1234", mock_api_client) + + assert document.number_of_mentions("some") == 2 + class TestDocumentValidation: def test_judgment_is_failure(self, mock_api_client): diff --git a/tests/test_helpers.py b/tests/test_helpers.py new file mode 100644 index 00000000..b0316c5f --- /dev/null +++ b/tests/test_helpers.py @@ -0,0 +1,8 @@ +from requests_toolbelt.multipart import encoder + + +class MockMultipartResponse: + def __init__(self, text): + multipart = encoder.MultipartEncoder({"content": text}) + self.content = multipart.to_string() + self.headers = {"content-type": multipart.content_type}