Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Resolve identifier URIs to MarkLogic URIs #794

Merged
merged 1 commit into from
Dec 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion script/build_xquery_type_dicts
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def ml_type_to_python_type_declaration(variable_name: str, variable_type: str):
variable_type = "MarkLogicDocumentVersionURIString"
elif variable_name == "privilege_uri":
variable_type = "MarkLogicPrivilegeURIString"
elif variable_name == "parent_uri":
elif variable_name in ["identifier_uri", "parent_uri"]:
variable_type = "DocumentURIString"
elif variable_name == "uri" or variable_name.endswith("_uri"):
variable_type = "MarkLogicDocumentURIString"
Expand Down
16 changes: 16 additions & 0 deletions src/caselawclient/Client.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

from caselawclient import xquery_type_dicts as query_dicts
from caselawclient.client_helpers import VersionAnnotation
from caselawclient.identifier_resolution import IdentifierResolutions
from caselawclient.models.documents import (
DOCUMENT_COLLECTION_URI_JUDGMENT,
DOCUMENT_COLLECTION_URI_PRESS_SUMMARY,
Expand Down Expand Up @@ -1201,3 +1202,18 @@ def get_recently_parsed(
)

return results

def resolve_from_identifier(self, identifier_uri: str, published_only: bool = True) -> IdentifierResolutions:
    """Resolve a PUI/EUI identifier URI to the MarkLogic documents behind it.

    Runs `resolve_from_identifier.xqy`, which looks the URI up against the
    precomputed identifier slugs. Callers should anticipate several matches
    for a single identifier.

    :param identifier_uri: The PUI/EUI URI to look up.
    :param published_only: Whether to ask for published documents only; sent
        to the query as an integer flag.

    :return: The resolutions decoded from the MarkLogic response.
    """
    query_vars: query_dicts.ResolveFromIdentifierDict = {
        "identifier_uri": DocumentURIString(identifier_uri),
        "published_only": int(published_only),
    }
    response = self._send_to_eval(query_vars, "resolve_from_identifier.xqy")
    rows: list[str] = get_multipart_strings_from_marklogic_response(response)
    return IdentifierResolutions.from_marklogic_output(rows)
43 changes: 43 additions & 0 deletions src/caselawclient/identifier_resolution.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import json
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm a bit conflicted about where this file lives -- I think this is better than being in models.identifiers. Maybe it should be in models.identifier_resolution?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree this isn't a data model - current location seems fine to me.

from typing import NamedTuple

from caselawclient.models.documents import DocumentURIString
from caselawclient.xquery_type_dicts import MarkLogicDocumentURIString


class IdentifierResolutions(list["IdentifierResolution"]):
    """
    A list of candidate MarkLogic documents which correspond to a Public UI uri

    MarkLogic returns a list of dictionaries; IdentifierResolution handles a single dictionary
    which corresponds to a single identifier to MarkLogic document mapping.

    see `xquery/resolve_from_identifier.xqy` and `resolve_from_identifier` in `Client.py`
    """

    @classmethod
    def from_marklogic_output(cls, table: list[str]) -> "IdentifierResolutions":
        """Build the list from raw MarkLogic rows.

        :param table: One raw string per resolution, each decoded by
            `IdentifierResolution.from_marklogic_output`.

        :return: A new list (of the class this is called on) holding one
            `IdentifierResolution` per row, in the same order.
        """
        # A classmethod (rather than a staticmethod naming the base class) means
        # subclasses get instances of their own type; the generator is consumed
        # directly by the list constructor, so no throwaway list is built.
        return cls(IdentifierResolution.from_marklogic_output(row) for row in table)

    def published(self) -> "IdentifierResolutions":
        """Return a new list containing only the resolutions whose document is published."""
        return type(self)(resolution for resolution in self if resolution.document_published)


class IdentifierResolution(NamedTuple):
    """One identifier-to-document mapping, as reported by MarkLogic."""

    identifier_uuid: str
    document_uri: MarkLogicDocumentURIString
    identifier_slug: DocumentURIString
    document_published: bool

    @staticmethod
    def from_marklogic_output(raw_row: str) -> "IdentifierResolution":
        """Decode a single JSON-encoded row into an `IdentifierResolution`.

        :param raw_row: A JSON object whose keys are the fully qualified
            `documents.compiled_url_slugs.*` column names.

        :return: The decoded resolution; `document_published` is true only
            when the row carries the string `"true"`.

        :raises KeyError: If any expected column is missing from the row.
        """
        # NOTE(review): candidate for refactoring in future if we do much more
        # with TDE; possibly one to touch on when we improve how EUI reports on
        # things in various pending states, or when we do exception reporting.
        payload = json.loads(raw_row)

        uuid = payload["documents.compiled_url_slugs.identifier_uuid"]
        uri = MarkLogicDocumentURIString(payload["documents.compiled_url_slugs.document_uri"])
        slug = DocumentURIString(payload["documents.compiled_url_slugs.identifier_slug"])
        is_published = payload["documents.compiled_url_slugs.document_published"] == "true"

        return IdentifierResolution(
            identifier_uuid=uuid,
            document_uri=uri,
            identifier_slug=slug,
            document_published=is_published,
        )
17 changes: 17 additions & 0 deletions src/caselawclient/xquery/resolve_from_identifier.xqy
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
xquery version "1.0-ml";

(:
  Look up rows in compiled_url_slugs whose identifier_slug equals the
  supplied identifier URI, optionally restricted to published documents.

  External variables:
    $identifier_uri  - the slug value to match (bound to @uri below).
    $published_only  - optional integer flag, defaulting to 1; when it
                       evaluates as true in the `if` below, only rows with
                       document_published = 'true' are selected.
:)

declare namespace xdmp="http://marklogic.com/xdmp";
declare variable $identifier_uri as xs:string external;
declare variable $published_only as xs:int? external := 1;

(: The optional filter is a fixed string literal; no caller-supplied text is concatenated into the SQL. :)
let $published_query := if ($published_only) then " AND document_published = 'true'" else ""
let $query := "SELECT * from compiled_url_slugs WHERE (identifier_slug = @uri)" || $published_query

(: The caller-supplied URI reaches the query only through the @uri binding in the map below. :)
(: NOTE(review): the "map" option presumably makes xdmp:sql return each row as a map - confirm against the MarkLogic docs. :)
return xdmp:sql(
$query,
"map",
map:new((
map:entry("uri", $identifier_uri)
))
)

6 changes: 6 additions & 0 deletions src/caselawclient/xquery_type_dicts.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,12 @@ class ListJudgmentVersionsDict(MarkLogicAPIDict):
uri: MarkLogicDocumentURIString


# resolve_from_identifier.xqy
class ResolveFromIdentifierDict(MarkLogicAPIDict):
    """Variables passed to `resolve_from_identifier.xqy`."""

    # The identifier URI to resolve.
    identifier_uri: DocumentURIString
    # Integer flag for restricting results to published documents; the client
    # sends `int(published_only)`, i.e. 0 or 1.
    published_only: Optional[int]


# set_boolean_property.xqy
class SetBooleanPropertyDict(MarkLogicAPIDict):
name: str
Expand Down
31 changes: 31 additions & 0 deletions tests/client/test_identifier_resolution.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from caselawclient.identifier_resolution import IdentifierResolutions

# Two raw rows in the JSON shape decoded by IdentifierResolution.from_marklogic_output:
# the first is unpublished, the second is a published placeholder.
raw_marklogic_resolutions = [
"""
{"documents.compiled_url_slugs.identifier_uuid":"24b9a384-8bcf-4f20-996a-5c318f8dc657",
"documents.compiled_url_slugs.document_uri":"/ewca/civ/2003/547.xml",
"documents.compiled_url_slugs.identifier_slug":"ewca/civ/2003/54721",
"documents.compiled_url_slugs.document_published":"false"}
""",
"""
{"documents.compiled_url_slugs.identifier_uuid":"x",
"documents.compiled_url_slugs.document_uri":"x",
"documents.compiled_url_slugs.identifier_slug":"x",
"documents.compiled_url_slugs.document_published":"true"}
""",
]


def test_decoded_identifier():
    """The first raw row decodes into the expected field values."""
    decoded_resolutions = IdentifierResolutions.from_marklogic_output(raw_marklogic_resolutions)
    res = decoded_resolutions[0]
    assert res.identifier_uuid == "24b9a384-8bcf-4f20-996a-5c318f8dc657"
    assert res.document_uri == "/ewca/civ/2003/547.xml"
    assert res.identifier_slug == "ewca/civ/2003/54721"
    # Identity check: the field must be the real bool False, not merely falsy
    # (and no lint suppression is needed, unlike `== False`).
    assert res.document_published is False


def test_published():
    """`published()` keeps only the published row from the fixture."""
    all_resolutions = IdentifierResolutions.from_marklogic_output(raw_marklogic_resolutions)
    published_only = all_resolutions.published()
    assert len(published_only) == 1
    assert published_only[0] == all_resolutions[1]
Loading