-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
…work-for-storing-multiple-identifiers-against-documents [FCL-309] Add the ability for documents to store abstract identifiers in MarkLogic properties
- Loading branch information
Showing
15 changed files
with
499 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
from abc import ABC, abstractmethod | ||
from typing import Any, Optional | ||
from uuid import uuid4 | ||
|
||
from lxml import etree | ||
|
||
# Attributes written, in this order, into the packed XML representation of an identifier.
IDENTIFIER_PACKABLE_ATTRIBUTES: list[str] = ["uuid", "value", "url_slug"]

# Attributes read back out of the packed XML representation when rebuilding an identifier.
IDENTIFIER_UNPACKABLE_ATTRIBUTES: list[str] = ["uuid", "value"]
|
||
|
||
class InvalidIdentifierXMLRepresentationException(Exception):
    """Raised when an XML representation of an identifier is missing a required element or it is empty."""
|
||
|
||
class IdentifierSchema(ABC):
    """
    Abstract description of an identifier schema.

    Every subclass must set a truthy ``name`` and ``namespace`` class attribute; this is
    enforced when the subclass is defined.
    """

    name: str
    namespace: str

    def __init_subclass__(cls: type["IdentifierSchema"], **kwargs: Any) -> None:
        """Reject any subclass which leaves a required class attribute unset or empty."""
        # Find the first required attribute (in declaration order) which is absent or falsy.
        missing = next(
            (attribute for attribute in ("name", "namespace") if not getattr(cls, attribute, False)),
            None,
        )
        if missing is not None:
            raise NotImplementedError(f"Can't instantiate IdentifierSchema without {missing} attribute.")
        super().__init_subclass__(**kwargs)

    def __repr__(self) -> str:
        return self.name

    @classmethod
    @abstractmethod
    def validate_identifier(cls, value: str) -> bool:
        """Return whether the given identifier value is valid under this schema."""

    @classmethod
    @abstractmethod
    def compile_identifier_url_slug(cls, value: str) -> str:
        """Turn an identifier value into its precompiled URL slug."""
|
||
|
||
class Identifier(ABC):
    """
    Abstract parent of every concrete identifier type.

    Each subclass must set a truthy ``schema`` class attribute; this is enforced when the
    subclass is defined.
    """

    schema: type[IdentifierSchema]

    uuid: str
    value: str

    def __init_subclass__(cls: type["Identifier"], **kwargs: Any) -> None:
        """Reject any subclass which does not declare a schema."""
        # Find the first required attribute which is absent or falsy on the new subclass.
        missing = next(
            (attribute for attribute in ("schema",) if not getattr(cls, attribute, False)),
            None,
        )
        if missing is not None:
            raise NotImplementedError(f"Can't instantiate Identifier without {missing} attribute.")
        super().__init_subclass__(**kwargs)

    def __init__(self, value: str, uuid: Optional[str] = None) -> None:
        """Store the identifier value, generating an ``id-``-prefixed UUID when none is supplied."""
        self.value = value
        self.uuid = uuid if uuid else "id-" + str(uuid4())

    def __repr__(self) -> str:
        return f"{self.uuid} ({self.schema.name}): {self.value}"

    @property
    def url_slug(self) -> str:
        """The URL slug for this identifier's value, as compiled by its schema."""
        return self.schema.compile_identifier_url_slug(self.value)

    @property
    def as_xml_tree(self) -> etree._Element:
        """Build the packed XML representation of this identifier, as used for storage."""
        root = etree.Element("identifier")

        # The namespace comes from the schema rather than from the instance itself.
        namespace_node = etree.SubElement(root, "namespace")
        namespace_node.text = self.schema.namespace

        # Every packable attribute becomes a child element holding that attribute's value.
        for attribute_name in IDENTIFIER_PACKABLE_ATTRIBUTES:
            attribute_node = etree.SubElement(root, attribute_name)
            attribute_node.text = getattr(self, attribute_name)

        return root
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
import re | ||
|
||
from ds_caselaw_utils import neutral_url | ||
from ds_caselaw_utils.types import NeutralCitationString | ||
|
||
from . import Identifier, IdentifierSchema | ||
|
||
VALID_NCN_PATTERN = re.compile(r"(^\[([0-9]{4})\] ([a-zA-Z]+)(?: ([a-zA-Z]+))? ([0-9]+)(?: \(([a-zA-Z]+)\))?$)")
"""
This is a catch-all pattern for anything which looks like a Neutral Citation, even if the court itself isn't valid. Checking that an NCN is plausibly correct is handled elsewhere.

This pattern defines six capture groups. Numbered as `re.Match.group()` numbers them, they are:

- `1`: The entire citation (the outermost group wraps the whole pattern)
- `2`: The year of the decision
- `3`: The court
- `4`: (Optionally) the jurisdiction or division, depending on the court
- `5`: The sequence number of the decision
- `6`: (Optionally) the jurisdiction or division, depending on the court

An optional group which did not take part in the match is `None`.

TODO: When these capture groups are being used in anger (eg to build URL slugs) you should go through and name the groups.
"""
|
||
|
||
class NeutralCitationNumberSchema(IdentifierSchema):
    """
    Identifier schema describing a Neutral Citation Number.
    https://www.iclr.co.uk/knowledge/case-law/neutral-citations/
    """

    name = "Neutral Citation Number"
    namespace = "ukncn"

    @classmethod
    def validate_identifier(cls, value: str) -> bool:
        """Return whether ``value`` has the overall shape of a Neutral Citation Number."""
        # `fullmatch` rather than `match`: the pattern's trailing `$` also matches just before a
        # final newline, so `match` would wrongly accept a value such as "[2023] UKSC 1\n".
        return bool(VALID_NCN_PATTERN.fullmatch(value))

    @classmethod
    def compile_identifier_url_slug(cls, value: str) -> str:
        """
        Convert a Neutral Citation Number into its NCN-based URL slug.

        Raises:
            Exception: if ``neutral_url`` returns nothing for this value.
        """
        ncn_based_uri_string = neutral_url(
            NeutralCitationString(value)
        )  # TODO: At some point this should move out of utils and into this class.
        if not ncn_based_uri_string:
            raise Exception(f"Unable to convert NCN {value} into NCN-based URL slug")
        return ncn_based_uri_string
|
||
|
||
class NeutralCitationNumber(Identifier):
    """An identifier whose schema is NeutralCitationNumberSchema."""

    schema = NeutralCitationNumberSchema
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
from lxml import etree | ||
|
||
from . import IDENTIFIER_UNPACKABLE_ATTRIBUTES, Identifier, InvalidIdentifierXMLRepresentationException | ||
from .neutral_citation import NeutralCitationNumber | ||
|
||
# Maps the text of a packed identifier's <namespace> element to the Identifier subclass used to rebuild it.
IDENTIFIER_NAMESPACE_MAP: dict[str, type[Identifier]] = {"ukncn": NeutralCitationNumber}
|
||
|
||
def unpack_identifier_from_etree(identifier_xml: etree._Element) -> Identifier:
    """
    Given an etree representation of an identifier, unpack it into an appropriate instance of an Identifier.

    The ``namespace`` child element selects the Identifier subclass via ``IDENTIFIER_NAMESPACE_MAP``;
    each attribute named in ``IDENTIFIER_UNPACKABLE_ATTRIBUTES`` is read from the child element of
    the same name and passed to that subclass as a keyword argument.

    Raises:
        InvalidIdentifierXMLRepresentationException: if the namespace element, or any unpackable
            attribute element, is missing or has no text.
        KeyError: if the namespace text is not a key of ``IDENTIFIER_NAMESPACE_MAP``.
    """

    namespace_element = identifier_xml.find("namespace")

    if namespace_element is None or not namespace_element.text:
        raise InvalidIdentifierXMLRepresentationException(
            "Identifer XML representation is not valid: namespace not present or empty"
        )

    kwargs: dict[str, str] = {}

    for attribute in IDENTIFIER_UNPACKABLE_ATTRIBUTES:
        element = identifier_xml.find(attribute)
        if element is None or not element.text:
            # Report the attribute's name: `element` itself is either None or an element object
            # here, neither of which tells the reader which part of the XML is at fault.
            raise InvalidIdentifierXMLRepresentationException(
                f"Identifer XML representation is not valid: {attribute} not present or empty"
            )
        kwargs[attribute] = element.text

    return IDENTIFIER_NAMESPACE_MAP[namespace_element.text](**kwargs)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
xquery version "1.0-ml";

(: Return the property called $name from the properties of the document at $uri. :)

declare variable $uri as xs:string external;
declare variable $name as xs:string external;

(: The property is selected by a QName built from $name with an empty namespace URI. :)
xdmp:document-get-properties($uri, fn:QName("", $name))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
xquery version "1.0-ml";

import module namespace dls = "http://marklogic.com/xdmp/dls" at "/MarkLogic/dls.xqy";

(: Set the property called $name on the document at $uri from the XML string in $value. :)

declare variable $uri as xs:string external;
declare variable $value as xs:string external;
declare variable $name as xs:string external;

(: $value is parsed as XML; the child elements of its root element are re-wrapped
   in a new element named $name, which is what gets stored as the property. :)
dls:document-set-property(
  $uri,
  element {$name} {xdmp:unquote($value)/*/*}
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.