Skip to content

Commit

Permalink
Merge pull request #778 from nationalarchives/FCL-309-implement-frame…
Browse files Browse the repository at this point in the history
…work-for-storing-multiple-identifiers-against-documents

[FCL-309] Add the ability for documents to store abstract identifiers in MarkLogic properties
  • Loading branch information
jacksonj04 authored Dec 3, 2024
2 parents 427e141 + deb72ae commit 1dc5934
Show file tree
Hide file tree
Showing 15 changed files with 499 additions and 0 deletions.
11 changes: 11 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,19 @@ The format is based on [Keep a Changelog 1.0.0].

## Unreleased

### Feat

- **FCL-309**: identifiers can compile URL slugs
- **FCL-309**: identifiers can now be saved to and retrieved from MarkLogic
- **FCL-309**: add functionality for packing and unpacking XML representations of identifiers
- **FCL-309**: add stub for defining identifier schemas, and a Neutral Citation schema

### Fix

- **deps**: update dependency ds-caselaw-utils to v2.0.1
- **deps**: update dependency mypy-boto3-sns to v1.35.68
- **deps**: update boto packages to v1.35.67
- **deps**: update dependency boto3 to v1.35.64
- **deps**: update boto packages to v1.35.61

## v28.0.0 (2024-11-14)
Expand Down
28 changes: 28 additions & 0 deletions src/caselawclient/Client.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import environ
import requests
from ds_caselaw_utils.types import NeutralCitationString
from lxml import etree
from requests.auth import HTTPBasicAuth
from requests.structures import CaseInsensitiveDict
from requests_toolbelt.multipart import decoder
Expand Down Expand Up @@ -864,6 +865,17 @@ def get_property(self, judgment_uri: DocumentURIString, name: str) -> str:
}
return self._eval_and_decode(vars, "get_property.xqy")

def get_property_as_node(self, judgment_uri: DocumentURIString, name: str) -> Optional[etree._Element]:
    """
    Fetch a named MarkLogic property of a document and parse it as XML.

    :param judgment_uri: The URI of the document whose property should be read.
    :param name: The name of the property to read.

    :return: The parsed root element of the property, or `None` if the query returned nothing.
    """
    query_vars: query_dicts.GetPropertyAsNodeDict = {
        "uri": self._format_uri_for_marklogic(judgment_uri),
        "name": name,
    }
    raw_property = self._eval_and_decode(query_vars, "get_property_as_node.xqy")
    # An empty response means there is nothing to parse.
    return etree.fromstring(raw_property) if raw_property else None

def get_version_annotation(self, judgment_uri: DocumentURIString) -> str:
uri = self._format_uri_for_marklogic(judgment_uri)
vars: query_dicts.GetVersionAnnotationDict = {
Expand Down Expand Up @@ -896,6 +908,22 @@ def set_property(

return self._send_to_eval(vars, "set_property.xqy")

def set_property_as_node(
    self,
    judgment_uri: DocumentURIString,
    name: str,
    value: etree._Element,
) -> requests.Response:
    """
    Set a MarkLogic property of a document from an XML tree.

    The property is set to the _contents_ of the root node passed in `value`; the root node itself is discarded.

    :param judgment_uri: The URI of the document whose property should be set.
    :param name: The name of the property to set.
    :param value: A root element whose children become the property's contents.

    :return: The response from MarkLogic.
    """
    marklogic_uri = self._format_uri_for_marklogic(judgment_uri)
    # The query takes its variables as strings, so the tree is serialised before sending.
    serialised_value = etree.tostring(value).decode()

    query_vars: query_dicts.SetPropertyAsNodeDict = {
        "uri": marklogic_uri,
        "value": serialised_value,
        "name": name,
    }
    return self._send_to_eval(query_vars, "set_property_as_node.xqy")

def set_boolean_property(
self,
judgment_uri: DocumentURIString,
Expand Down
1 change: 1 addition & 0 deletions src/caselawclient/factories.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ def build(
if not api_client:
api_client = Mock(spec=MarklogicApiClient)
api_client.get_judgment_xml_bytestring.return_value = DEFAULT_DOCUMENT_BODY_XML.encode(encoding="utf-8")
api_client.get_property_as_node.return_value = None

document = cls.target_class(uri, api_client=api_client)
document.content_as_html = Mock(return_value=html) # type: ignore[method-assign]
Expand Down
44 changes: 44 additions & 0 deletions src/caselawclient/models/documents/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from ds_caselaw_utils import courts
from ds_caselaw_utils.courts import CourtNotFoundException
from ds_caselaw_utils.types import NeutralCitationString
from lxml import etree
from lxml import html as html_parser
from requests_toolbelt.multipart import decoder

Expand All @@ -15,6 +16,8 @@
NotSupportedOnVersion,
OnlySupportedOnVersion,
)
from caselawclient.models.identifiers import Identifier
from caselawclient.models.identifiers.unpacker import unpack_identifier_from_etree
from caselawclient.models.utilities import VersionsDict, extract_version, render_versions
from caselawclient.models.utilities.aws import (
ParserInstructionsDict,
Expand Down Expand Up @@ -124,6 +127,8 @@ class Document:
Individual document classes should extend this list where necessary to validate document type-specific attributes.
"""

_identifiers: dict[str, Identifier]

def __init__(self, uri: DocumentURIString, api_client: "MarklogicApiClient", search_query: Optional[str] = None):
"""
:param uri: The URI of the document to retrieve from MarkLogic.
Expand All @@ -146,6 +151,8 @@ def __init__(self, uri: DocumentURIString, api_client: "MarklogicApiClient", sea
)
""" `Document.body` represents the body of the document itself, without any information such as version tracking or properties. """

self._initialise_identifiers()

def __repr__(self) -> str:
    """Describe the document by its noun, URI and name, substituting "un-named" when the body has no name."""
    display_name = self.body.name or "un-named"
    return "<{} {}: {}>".format(self.document_noun, self.uri, display_name)
Expand All @@ -160,6 +167,43 @@ def docx_exists(self) -> bool:
"""There is a docx in S3 private bucket for this Document"""
return check_docx_exists(self.uri)

def _initialise_identifiers(self) -> None:
    """Reset this document's identifiers, then repopulate them from the `identifiers` property in MarkLogic."""
    self._identifiers = {}

    stored_identifiers = self.api_client.get_property_as_node(self.uri, "identifiers")
    if stored_identifiers is None:
        # No stored property, so the document starts with no identifiers.
        return

    for packed_identifier in stored_identifiers.findall("identifier"):
        self.add_identifier(unpack_identifier_from_etree(packed_identifier))

@property
def identifiers(self) -> list[Identifier]:
    """Return this document's Identifier objects as a new list, for easy display and interaction."""
    return [*self._identifiers.values()]

def add_identifier(self, identifier: Identifier) -> None:
    """Register an Identifier against this Document, keyed by its UUID (an existing entry with the same UUID is replaced)."""
    self._identifiers.update({identifier.uuid: identifier})

@property
def identifiers_as_etree(self) -> etree._Element:
    """Build an `<identifiers>` element containing the packed XML form of each of the Document's identifiers."""
    root = etree.Element("identifiers")
    root.extend(identifier.as_xml_tree for identifier in self.identifiers)
    return root

def save_identifiers(self) -> None:
    """Write this Document's current identifiers to the `identifiers` property in MarkLogic."""
    packed_identifiers = self.identifiers_as_etree
    self.api_client.set_property_as_node(self.uri, "identifiers", packed_identifiers)

@property
def best_human_identifier(self) -> Optional[str]:
"""
Expand Down
98 changes: 98 additions & 0 deletions src/caselawclient/models/identifiers/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
from abc import ABC, abstractmethod
from typing import Any, Optional
from uuid import uuid4

from lxml import etree

# Attributes of an Identifier which are written out, one child element each,
# when it is packed into its XML representation (see `Identifier.as_xml_tree`).
IDENTIFIER_PACKABLE_ATTRIBUTES: list[str] = [
"uuid",
"value",
"url_slug",
]

# Attributes which are read back from the XML representation and passed to the
# Identifier constructor when unpacking. `url_slug` is not listed because it is
# computed from `value` by the identifier's schema rather than stored on the instance.
IDENTIFIER_UNPACKABLE_ATTRIBUTES: list[str] = [
"uuid",
"value",
]


class InvalidIdentifierXMLRepresentationException(Exception):
    """Raised when an XML representation of an identifier cannot be unpacked."""


class IdentifierSchema(ABC):
    """
    Abstract description of an identifier schema.

    Concrete schemas must set the `name` and `namespace` class attributes, and implement
    `validate_identifier` and `compile_identifier_url_slug`.
    """

    name: str
    namespace: str

    def __init_subclass__(cls: type["IdentifierSchema"], **kwargs: Any) -> None:
        """Refuse to define a subclass which leaves `name` or `namespace` unset or empty."""
        first_missing = next(
            (attribute for attribute in ("name", "namespace") if not getattr(cls, attribute, False)),
            None,
        )
        if first_missing is not None:
            raise NotImplementedError(f"Can't instantiate IdentifierSchema without {first_missing} attribute.")
        super().__init_subclass__(**kwargs)

    def __repr__(self) -> str:
        return self.name

    @classmethod
    @abstractmethod
    def validate_identifier(cls, value: str) -> bool:
        """Report whether the given identifier value is valid for this schema."""

    @classmethod
    @abstractmethod
    def compile_identifier_url_slug(cls, value: str) -> str:
        """Turn an identifier value into a precompiled URL slug."""


class Identifier(ABC):
    """
    Base class for concrete identifiers.

    Subclasses must set the `schema` class attribute to the `IdentifierSchema` which governs them.
    """

    schema: type[IdentifierSchema]

    uuid: str
    value: str

    def __init_subclass__(cls: type["Identifier"], **kwargs: Any) -> None:
        """Refuse to define a subclass which leaves `schema` unset."""
        if not getattr(cls, "schema", False):
            raise NotImplementedError("Can't instantiate Identifier without schema attribute.")
        super().__init_subclass__(**kwargs)

    def __repr__(self) -> str:
        return f"{self.uuid} ({self.schema.name}): {self.value}"

    def __init__(self, value: str, uuid: Optional[str] = None) -> None:
        """
        :param value: The value of this identifier.
        :param uuid: An existing UUID for this identifier. If omitted (or empty), a new one prefixed with `id-` is generated.
        """
        self.value = value
        self.uuid = uuid or f"id-{uuid4()}"

    @property
    def as_xml_tree(self) -> etree._Element:
        """Pack this Identifier into an `<identifier>` element for storage: the schema namespace, then each packable attribute."""
        packed = etree.Element("identifier")
        etree.SubElement(packed, "namespace").text = self.schema.namespace

        for attribute_name in IDENTIFIER_PACKABLE_ATTRIBUTES:
            etree.SubElement(packed, attribute_name).text = getattr(self, attribute_name)

        return packed

    @property
    def url_slug(self) -> str:
        """The URL slug for this identifier's value, as compiled by its schema."""
        return self.schema.compile_identifier_url_slug(self.value)
49 changes: 49 additions & 0 deletions src/caselawclient/models/identifiers/neutral_citation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import re

from ds_caselaw_utils import neutral_url
from ds_caselaw_utils.types import NeutralCitationString

from . import Identifier, IdentifierSchema

VALID_NCN_PATTERN = re.compile(r"(^\[([0-9]{4})\] ([a-zA-Z]+)(?: ([a-zA-Z]+))? ([0-9]+)(?: \(([a-zA-Z]+)\))?$)")
"""
This is a catch-all pattern for anything which looks like a Neutral Citation, even if the court itself isn't valid. Checking that an NCN is plausibly correct is handled elsewhere.

The whole expression is wrapped in an outer capture group, so the pattern defines six capture groups. Indexed as positions in `match.groups()`, they are:

- `0`: The entire citation
- `1`: The year of the decision
- `2`: The court
- `3`: (Optionally) the jurisdiction or division, depending on the court
- `4`: The sequence number of the decision
- `5`: (Optionally) the jurisdiction or division, depending on the court

Optional groups which do not take part in a match are `None`. Note that `match.group(n)` is one-based (`group(0)` is always the whole match), so the year is `match.group(2)`, the court is `match.group(3)`, and so on.

TODO: When these capture groups are being used in anger (eg to build URL slugs) you should go through and name the groups.
"""


class NeutralCitationNumberSchema(IdentifierSchema):
    """
    The identifier schema for Neutral Citation Numbers.
    https://www.iclr.co.uk/knowledge/case-law/neutral-citations/
    """

    name = "Neutral Citation Number"
    namespace = "ukncn"

    @classmethod
    def validate_identifier(cls, value: str) -> bool:
        """Report whether `value` matches the general shape of a Neutral Citation."""
        return VALID_NCN_PATTERN.match(value) is not None

    @classmethod
    def compile_identifier_url_slug(cls, value: str) -> str:
        """
        Convert a Neutral Citation into its NCN-based URL slug.

        :raises Exception: If no slug can be built from `value`.
        """
        # TODO: At some point this should move out of utils and into this class.
        url_slug = neutral_url(NeutralCitationString(value))
        if url_slug:
            return url_slug
        raise Exception(f"Unable to convert NCN {value} into NCN-based URL slug")


class NeutralCitationNumber(Identifier):
    """An identifier governed by the Neutral Citation Number schema."""

    schema = NeutralCitationNumberSchema
31 changes: 31 additions & 0 deletions src/caselawclient/models/identifiers/unpacker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from lxml import etree

from . import IDENTIFIER_UNPACKABLE_ATTRIBUTES, Identifier, InvalidIdentifierXMLRepresentationException
from .neutral_citation import NeutralCitationNumber

# Maps the text of a packed identifier's `<namespace>` element to the Identifier
# subclass used to unpack it. A namespace must be registered here before
# identifiers using it can be unpacked.
IDENTIFIER_NAMESPACE_MAP: dict[str, type[Identifier]] = {
"ukncn": NeutralCitationNumber,
}


def unpack_identifier_from_etree(identifier_xml: etree._Element) -> Identifier:
    """
    Given an etree representation of an identifier, unpack it into an appropriate instance of an Identifier.

    :param identifier_xml: An `<identifier>` element containing a `<namespace>` child and one child per unpackable attribute.

    :return: An instance of the Identifier subclass registered for the element's namespace.

    :raises InvalidIdentifierXMLRepresentationException: If the namespace, or any unpackable attribute, is missing or empty.
    :raises KeyError: If the namespace is not registered in `IDENTIFIER_NAMESPACE_MAP`.
    """

    namespace_element = identifier_xml.find("namespace")

    if namespace_element is None or not namespace_element.text:
        raise InvalidIdentifierXMLRepresentationException(
            "Identifer XML representation is not valid: namespace not present or empty"
        )

    kwargs: dict[str, str] = {}

    for attribute in IDENTIFIER_UNPACKABLE_ATTRIBUTES:
        element = identifier_xml.find(attribute)
        if element is None or not element.text:
            # Report the attribute name we looked for: `element` is None or an
            # element object here, so interpolating it never names the missing field.
            raise InvalidIdentifierXMLRepresentationException(
                f"Identifer XML representation is not valid: {attribute} not present or empty"
            )
        kwargs[attribute] = element.text

    return IDENTIFIER_NAMESPACE_MAP[namespace_element.text](**kwargs)
9 changes: 9 additions & 0 deletions src/caselawclient/xquery/get_property_as_node.xqy
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
xquery version "1.0-ml";

(:
  Return a named property of a document as a node.

  $uri  - the URI of the document whose properties are read
  $name - the local name of the property; it is looked up with an empty namespace
:)
declare variable $uri as xs:string external;

declare variable $name as xs:string external;

(: Build a QName with an empty namespace URI from the plain property name :)
let $prop := fn:QName("", $name)

return xdmp:document-get-properties($uri, $prop)
11 changes: 11 additions & 0 deletions src/caselawclient/xquery/set_property_as_node.xqy
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
xquery version "1.0-ml";

(:
  Set a named property of a document from a serialised XML tree.

  $uri   - the URI of the document whose property is set
  $value - serialised XML; only the children of its root element are used
  $name  - the name of the property element to create
:)
import module namespace dls = "http://marklogic.com/xdmp/dls" at "/MarkLogic/dls.xqy";

declare variable $uri as xs:string external;
declare variable $value as xs:string external;
declare variable $name as xs:string external;

(: Parse $value, discard its root element, and re-wrap the root's children in a new element named $name :)
let $props := ( element {$name} {xdmp:unquote($value)/*/*} )

return dls:document-set-property($uri, $props)
13 changes: 13 additions & 0 deletions src/caselawclient/xquery_type_dicts.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,12 @@ class GetPropertyDict(MarkLogicAPIDict):
uri: MarkLogicDocumentURIString


# get_property_as_node.xqy
class GetPropertyAsNodeDict(MarkLogicAPIDict):
    """Variables passed to `get_property_as_node.xqy`."""

    name: str
    uri: MarkLogicDocumentURIString


# get_version_annotation.xqy
class GetVersionAnnotationDict(MarkLogicAPIDict):
uri: MarkLogicDocumentURIString
Expand Down Expand Up @@ -187,6 +193,13 @@ class SetPropertyDict(MarkLogicAPIDict):
value: str


# set_property_as_node.xqy
class SetPropertyAsNodeDict(MarkLogicAPIDict):
    """Variables passed to `set_property_as_node.xqy`."""

    name: str
    uri: MarkLogicDocumentURIString
    value: str


# update_document.xqy
class UpdateDocumentDict(MarkLogicAPIDict):
annotation: str
Expand Down
1 change: 1 addition & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,5 +129,6 @@ def _generate_mock_response(search_response_xml: str) -> Mock:
def mock_api_client():
mock_client = Mock(spec=MarklogicApiClient)
mock_client.get_judgment_xml_bytestring.return_value = b"<xml>content</xml>"
mock_client.get_property_as_node.return_value = None

return mock_client
Loading

0 comments on commit 1dc5934

Please sign in to comment.