From 5a50e75dd8219eb0bb619008a0136572d813b711 Mon Sep 17 00:00:00 2001
From: David McKee
Date: Fri, 6 Dec 2024 15:26:12 +0000
Subject: [PATCH] Resolve identifier URIs to MarkLogic URIs

---
 script/build_xquery_type_dicts                |  2 +-
 src/caselawclient/Client.py                   | 17 +++++++
 src/caselawclient/identifier_resolution.py    | 46 +++++++++++++++++++
 .../xquery/resolve_from_identifier.xqy        | 17 ++++++++
 src/caselawclient/xquery_type_dicts.py        |  6 +++
 tests/client/test_identifier_resolution.py    | 31 +++++++++++++
 6 files changed, 118 insertions(+), 1 deletion(-)
 create mode 100644 src/caselawclient/identifier_resolution.py
 create mode 100644 src/caselawclient/xquery/resolve_from_identifier.xqy
 create mode 100644 tests/client/test_identifier_resolution.py

diff --git a/script/build_xquery_type_dicts b/script/build_xquery_type_dicts
index 98edd49e..8b7a5c2e 100755
--- a/script/build_xquery_type_dicts
+++ b/script/build_xquery_type_dicts
@@ -54,7 +54,7 @@ def ml_type_to_python_type_declaration(variable_name: str, variable_type: str):
         variable_type = "MarkLogicDocumentVersionURIString"
     elif variable_name == "privilege_uri":
         variable_type = "MarkLogicPrivilegeURIString"
-    elif variable_name == "parent_uri":
+    elif variable_name in ["identifier_uri", "parent_uri"]:
         variable_type = "DocumentURIString"
     elif variable_name == "uri" or variable_name.endswith("_uri"):
         variable_type = "MarkLogicDocumentURIString"
diff --git a/src/caselawclient/Client.py b/src/caselawclient/Client.py
index 40a1b3ca..10c2c86b 100644
--- a/src/caselawclient/Client.py
+++ b/src/caselawclient/Client.py
@@ -20,6 +20,7 @@
 
 from caselawclient import xquery_type_dicts as query_dicts
 from caselawclient.client_helpers import VersionAnnotation
+from caselawclient.identifier_resolution import IdentifierResolutions
 from caselawclient.models.documents import (
     DOCUMENT_COLLECTION_URI_JUDGMENT,
     DOCUMENT_COLLECTION_URI_PRESS_SUMMARY,
@@ -1201,3 +1202,19 @@ def get_recently_parsed(
         )
 
         return results
+
+    def resolve_from_identifier(self, identifier_uri: str, published_only: bool = True) -> IdentifierResolutions:
+        """Given a PUI/EUI url, look up the precomputed slug and return the
+        MarkLogic document URIs which match that slug. Multiple returns should be anticipated"""
+        vars: query_dicts.ResolveFromIdentifierDict = {
+            "identifier_uri": DocumentURIString(identifier_uri),
+            # The XQuery declares $published_only as xs:int?, so the flag is sent as 0/1.
+            "published_only": int(published_only),
+        }
+        raw_results: list[str] = get_multipart_strings_from_marklogic_response(
+            self._send_to_eval(
+                vars,
+                "resolve_from_identifier.xqy",
+            ),
+        )
+        return IdentifierResolutions.from_marklogic_output(raw_results)
diff --git a/src/caselawclient/identifier_resolution.py b/src/caselawclient/identifier_resolution.py
new file mode 100644
index 00000000..afa70d51
--- /dev/null
+++ b/src/caselawclient/identifier_resolution.py
@@ -0,0 +1,46 @@
+import json
+from typing import NamedTuple
+
+from caselawclient.models.documents import DocumentURIString
+from caselawclient.xquery_type_dicts import MarkLogicDocumentURIString
+
+
+class IdentifierResolutions(list["IdentifierResolution"]):
+    """
+    A list of candidate MarkLogic documents which correspond to a Public UI URI.
+
+    MarkLogic returns a list of dictionaries; IdentifierResolution handles a single dictionary
+    which corresponds to a single identifier to MarkLogic document mapping.
+
+    See `xquery/resolve_from_identifier.xqy` and `resolve_from_identifier` in `Client.py`.
+    """
+
+    @classmethod
+    def from_marklogic_output(cls, table: list[str]) -> "IdentifierResolutions":
+        """Build the list from raw MarkLogic rows, each row being one JSON-encoded object."""
+        return cls([IdentifierResolution.from_marklogic_output(row) for row in table])
+
+    def published(self) -> "IdentifierResolutions":
+        """Return a new list containing only the resolutions for published documents."""
+        return IdentifierResolutions([resolution for resolution in self if resolution.document_published])
+
+
+class IdentifierResolution(NamedTuple):
+    """A single response from MarkLogic about a single identifier / document mapping."""
+
+    identifier_uuid: str
+    document_uri: MarkLogicDocumentURIString
+    identifier_slug: DocumentURIString
+    document_published: bool
+
+    @classmethod
+    def from_marklogic_output(cls, raw_row: str) -> "IdentifierResolution":
+        """Decode one JSON-encoded row, keyed by `documents.compiled_url_slugs.*` column names."""
+        row = json.loads(raw_row)
+        return cls(
+            identifier_uuid=row["documents.compiled_url_slugs.identifier_uuid"],
+            document_uri=MarkLogicDocumentURIString(row["documents.compiled_url_slugs.document_uri"]),
+            identifier_slug=DocumentURIString(row["documents.compiled_url_slugs.identifier_slug"]),
+            # The column arrives as the string "true" or "false" (see the test fixtures), hence the comparison.
+            document_published=row["documents.compiled_url_slugs.document_published"] == "true",
+        )
diff --git a/src/caselawclient/xquery/resolve_from_identifier.xqy b/src/caselawclient/xquery/resolve_from_identifier.xqy
new file mode 100644
index 00000000..cf2d5c1a
--- /dev/null
+++ b/src/caselawclient/xquery/resolve_from_identifier.xqy
@@ -0,0 +1,17 @@
+xquery version "1.0-ml";
+
+declare namespace xdmp="http://marklogic.com/xdmp";
+declare variable $identifier_uri as xs:string external;
+declare variable $published_only as xs:int? external := 1;
+
+let $published_query := if ($published_only) then " AND document_published = 'true'" else ""
+let $query := "SELECT * from compiled_url_slugs WHERE (identifier_slug = @uri)" || $published_query
+
+return xdmp:sql(
+  $query,
+  "map",
+  map:new((
+    map:entry("uri", $identifier_uri)
+  ))
+)
+
diff --git a/src/caselawclient/xquery_type_dicts.py b/src/caselawclient/xquery_type_dicts.py
index 4b3cab17..3f741434 100644
--- a/src/caselawclient/xquery_type_dicts.py
+++ b/src/caselawclient/xquery_type_dicts.py
@@ -141,6 +141,12 @@ class ListJudgmentVersionsDict(MarkLogicAPIDict):
     uri: MarkLogicDocumentURIString
 
 
+# resolve_from_identifier.xqy
+class ResolveFromIdentifierDict(MarkLogicAPIDict):
+    identifier_uri: DocumentURIString
+    published_only: Optional[int]
+
+
 # set_boolean_property.xqy
 class SetBooleanPropertyDict(MarkLogicAPIDict):
     name: str
diff --git a/tests/client/test_identifier_resolution.py b/tests/client/test_identifier_resolution.py
new file mode 100644
index 00000000..67d7093a
--- /dev/null
+++ b/tests/client/test_identifier_resolution.py
@@ -0,0 +1,31 @@
+from caselawclient.identifier_resolution import IdentifierResolutions
+
+raw_marklogic_resolutions = [
+    """
+    {"documents.compiled_url_slugs.identifier_uuid":"24b9a384-8bcf-4f20-996a-5c318f8dc657",
+     "documents.compiled_url_slugs.document_uri":"/ewca/civ/2003/547.xml",
+     "documents.compiled_url_slugs.identifier_slug":"ewca/civ/2003/54721",
+     "documents.compiled_url_slugs.document_published":"false"}
+    """,
+    """
+    {"documents.compiled_url_slugs.identifier_uuid":"x",
+     "documents.compiled_url_slugs.document_uri":"x",
+     "documents.compiled_url_slugs.identifier_slug":"x",
+     "documents.compiled_url_slugs.document_published":"true"}
+    """,
+]
+
+
+def test_decoded_identifier():
+    decoded_resolutions = IdentifierResolutions.from_marklogic_output(raw_marklogic_resolutions)
+    res = decoded_resolutions[0]
+    assert res.identifier_uuid == "24b9a384-8bcf-4f20-996a-5c318f8dc657"
+    assert res.document_uri == "/ewca/civ/2003/547.xml"
+    assert res.identifier_slug == "ewca/civ/2003/54721"
+    assert res.document_published is False
+
+
+def test_published():
+    decoded_resolutions = IdentifierResolutions.from_marklogic_output(raw_marklogic_resolutions)
+    assert len(decoded_resolutions.published()) == 1
+    assert decoded_resolutions.published()[0] == decoded_resolutions[1]