Skip to content

Commit

Permalink
Resolve identifier URIs to MarkLogic URIs
Browse files Browse the repository at this point in the history
  • Loading branch information
dragon-dxw committed Dec 9, 2024
1 parent b2f5aa7 commit 5a50e75
Show file tree
Hide file tree
Showing 6 changed files with 114 additions and 1 deletion.
2 changes: 1 addition & 1 deletion script/build_xquery_type_dicts
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def ml_type_to_python_type_declaration(variable_name: str, variable_type: str):
variable_type = "MarkLogicDocumentVersionURIString"
elif variable_name == "privilege_uri":
variable_type = "MarkLogicPrivilegeURIString"
elif variable_name == "parent_uri":
elif variable_name in ["identifier_uri", "parent_uri"]:
variable_type = "DocumentURIString"
elif variable_name == "uri" or variable_name.endswith("_uri"):
variable_type = "MarkLogicDocumentURIString"
Expand Down
16 changes: 16 additions & 0 deletions src/caselawclient/Client.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

from caselawclient import xquery_type_dicts as query_dicts
from caselawclient.client_helpers import VersionAnnotation
from caselawclient.identifier_resolution import IdentifierResolutions
from caselawclient.models.documents import (
DOCUMENT_COLLECTION_URI_JUDGMENT,
DOCUMENT_COLLECTION_URI_PRESS_SUMMARY,
Expand Down Expand Up @@ -1201,3 +1202,18 @@ def get_recently_parsed(
)

return results

def resolve_from_identifier(self, identifier_uri: str, published_only: bool = True) -> IdentifierResolutions:
    """Resolve a PUI/EUI URL to the MarkLogic document URIs sharing its slug.

    The precomputed slug is looked up by ``resolve_from_identifier.xqy``.
    Callers should anticipate more than one match being returned.

    :param identifier_uri: the identifier URI to look up
    :param published_only: sent to the query as an integer flag; when set,
        the query restricts matches to published documents
    :return: the decoded resolutions, one per row returned by MarkLogic
    """
    query_vars: query_dicts.ResolveFromIdentifierDict = {
        "identifier_uri": DocumentURIString(identifier_uri),
        "published_only": int(published_only),
    }
    response = self._send_to_eval(query_vars, "resolve_from_identifier.xqy")
    raw_results: list[str] = get_multipart_strings_from_marklogic_response(response)
    return IdentifierResolutions.from_marklogic_output(raw_results)
43 changes: 43 additions & 0 deletions src/caselawclient/identifier_resolution.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import json
from typing import NamedTuple

from caselawclient.models.documents import DocumentURIString
from caselawclient.xquery_type_dicts import MarkLogicDocumentURIString


class IdentifierResolutions(list["IdentifierResolution"]):
    """
    Candidate MarkLogic documents that correspond to one Public UI URI.

    MarkLogic returns a list of dictionaries; each dictionary is one
    identifier-to-document mapping and is handled by `IdentifierResolution`.
    See `xquery/resolve_from_identifier.xqy` and `resolve_from_identifier` in `Client.py`.
    """

    @staticmethod
    def from_marklogic_output(table: list[str]) -> "IdentifierResolutions":
        """Decode every raw MarkLogic row in `table` into an `IdentifierResolution`."""
        decoded_rows = [IdentifierResolution.from_marklogic_output(raw_row) for raw_row in table]
        return IdentifierResolutions(decoded_rows)

    def published(self) -> "IdentifierResolutions":
        """Return a new list containing only the resolutions for published documents."""
        published_only = [candidate for candidate in self if candidate.document_published]
        return IdentifierResolutions(published_only)


class IdentifierResolution(NamedTuple):
    """One row of MarkLogic output: a single identifier / document mapping."""

    identifier_uuid: str
    document_uri: MarkLogicDocumentURIString
    identifier_slug: DocumentURIString
    document_published: bool

    @staticmethod
    def from_marklogic_output(raw_row: str) -> "IdentifierResolution":
        """Decode one JSON-encoded row into an `IdentifierResolution`.

        The published flag is compared against the string "true"; any other
        value yields ``False``.
        """
        decoded = json.loads(raw_row)
        uuid = decoded["documents.compiled_url_slugs.identifier_uuid"]
        uri = MarkLogicDocumentURIString(decoded["documents.compiled_url_slugs.document_uri"])
        slug = DocumentURIString(decoded["documents.compiled_url_slugs.identifier_slug"])
        is_published = decoded["documents.compiled_url_slugs.document_published"] == "true"
        return IdentifierResolution(
            identifier_uuid=uuid,
            document_uri=uri,
            identifier_slug=slug,
            document_published=is_published,
        )
17 changes: 17 additions & 0 deletions src/caselawclient/xquery/resolve_from_identifier.xqy
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
xquery version "1.0-ml";

(:
 : Return the rows of compiled_url_slugs whose identifier_slug equals
 : $identifier_uri. When $published_only is truthy (it defaults to 1 if not
 : supplied), the rows are further restricted to document_published = 'true'.
 : Each row is returned as a map.
 :)

declare namespace xdmp="http://marklogic.com/xdmp";
declare variable $identifier_uri as xs:string external;
declare variable $published_only as xs:int? external := 1;

(: The identifier is passed as a bound parameter rather than spliced into the SQL text. :)
let $bindings := map:new((
  map:entry("uri", $identifier_uri)
))
let $base_query := "SELECT * from compiled_url_slugs WHERE (identifier_slug = @uri)"
let $published_filter := if ($published_only) then " AND document_published = 'true'" else ""

return xdmp:sql($base_query || $published_filter, "map", $bindings)

6 changes: 6 additions & 0 deletions src/caselawclient/xquery_type_dicts.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,12 @@ class ListJudgmentVersionsDict(MarkLogicAPIDict):
uri: MarkLogicDocumentURIString


# resolve_from_identifier.xqy
class ResolveFromIdentifierDict(MarkLogicAPIDict):
    """Variables passed to ``resolve_from_identifier.xqy``."""

    # Identifier URI to resolve; the query matches it against identifier_slug.
    identifier_uri: DocumentURIString
    # Integer flag; a truthy value makes the query return published documents only.
    published_only: Optional[int]


# set_boolean_property.xqy
class SetBooleanPropertyDict(MarkLogicAPIDict):
name: str
Expand Down
31 changes: 31 additions & 0 deletions tests/client/test_identifier_resolution.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from caselawclient.identifier_resolution import IdentifierResolutions

# Two raw JSON rows in the shape decoded by IdentifierResolutions.from_marklogic_output:
# the first describes an unpublished document, the second is a published row
# filled with placeholder "x" values.
raw_marklogic_resolutions = [
"""
{"documents.compiled_url_slugs.identifier_uuid":"24b9a384-8bcf-4f20-996a-5c318f8dc657",
"documents.compiled_url_slugs.document_uri":"/ewca/civ/2003/547.xml",
"documents.compiled_url_slugs.identifier_slug":"ewca/civ/2003/54721",
"documents.compiled_url_slugs.document_published":"false"}
""",
"""
{"documents.compiled_url_slugs.identifier_uuid":"x",
"documents.compiled_url_slugs.document_uri":"x",
"documents.compiled_url_slugs.identifier_slug":"x",
"documents.compiled_url_slugs.document_published":"true"}
""",
]


def test_decoded_identifier():
    """The first raw row decodes into the expected field values."""
    decoded_resolutions = IdentifierResolutions.from_marklogic_output(raw_marklogic_resolutions)
    res = decoded_resolutions[0]
    assert res.identifier_uuid == "24b9a384-8bcf-4f20-996a-5c318f8dc657"
    assert res.document_uri == "/ewca/civ/2003/547.xml"
    assert res.identifier_slug == "ewca/civ/2003/54721"
    # Identity check: the field must be the real boolean False, not merely a
    # falsy value such as 0 or "" that would satisfy `== False`.
    assert res.document_published is False


def test_published():
    """`published()` keeps only the second fixture row, which is the published one."""
    all_resolutions = IdentifierResolutions.from_marklogic_output(raw_marklogic_resolutions)
    only_published = all_resolutions.published()
    assert len(only_published) == 1
    assert only_published[0] == all_resolutions[1]

0 comments on commit 5a50e75

Please sign in to comment.