Skip to content

Commit

Permalink
Merge pull request #810 from nationalarchives/FCL-533-pui-needs-to-be…
Browse files Browse the repository at this point in the history
…-able-to-select-the-best-identifier-for-any-document

[FCL-533] ⚠️ Introduce methods to select the best identifier in various cases
  • Loading branch information
jacksonj04 authored Dec 18, 2024
2 parents 33002a4 + b204454 commit 54332a2
Show file tree
Hide file tree
Showing 11 changed files with 143 additions and 62 deletions.
16 changes: 16 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,22 @@ All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog 1.0.0].

## Unreleased

### BREAKING CHANGE

- Methods which were previously guaranteed to return a Neutral Citation may now return `None`.

### Feat

- **FCL-533**: getting scored or preferred identifiers can now be done by type
- **FCL-533**: modify human identifier to rely on identifiers framework
- **FCL-533**: add scoring to Identifiers

### Fix

- **IdentifierSchema**: use hasattr instead of getattr with a default when testing required attributes

## v28.2.0 (2024-12-17)

### Feat
Expand Down
19 changes: 11 additions & 8 deletions src/caselawclient/models/documents/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
NotSupportedOnVersion,
OnlySupportedOnVersion,
)
from caselawclient.models.identifiers import Identifier
from caselawclient.models.identifiers.fclid import FindCaseLawIdentifier, FindCaseLawIdentifierSchema
from caselawclient.models.identifiers.unpacker import unpack_all_identifiers_from_etree
from caselawclient.models.utilities import VersionsDict, extract_version, render_versions
Expand Down Expand Up @@ -171,13 +172,11 @@ def _initialise_identifiers(self) -> None:
self.identifiers = unpack_all_identifiers_from_etree(identifiers_element_as_etree)

@property
def best_human_identifier(self) -> Optional[str]:
"""
Some identifier that is understood by legal professionals to refer to this legal event
that is not the name of the document.
Typically, this will be the neutral citation number, should it exist.
Should typically be overridden in subclasses.
"""
def best_human_identifier(self) -> Optional[Identifier]:
"""Return the preferred identifier for the document, providing that it is considered human readable."""
preferred_identifier = self.identifiers.preferred()
if preferred_identifier and preferred_identifier.schema.human_readable:
return preferred_identifier
return None

@property
Expand Down Expand Up @@ -507,13 +506,17 @@ def force_reparse(self) -> None:
"documentType": parser_type_noun,
"metadata": {
"name": self.body.name or None,
"cite": self.best_human_identifier or None,
"cite": None,
"court": self.body.court or None,
"date": checked_date,
"uri": self.uri,
},
}

## TODO: Remove this hack around the fact that NCNs are assumed to be present for all documents' metadata, but actually different document classes may have different metadata
if hasattr(self, "neutral_citation"):
parser_instructions["metadata"]["cite"] = self.neutral_citation

request_parse(
uri=self.uri,
reference=self.consignment_reference,
Expand Down
33 changes: 32 additions & 1 deletion src/caselawclient/models/identifiers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,20 @@ class IdentifierSchema(ABC):
name: str
namespace: str

human_readable: bool
""" Should this identifier type be considered for display as a 'human readable' identifier? """

base_score_multiplier: float = 1.0
""" A multiplier used to adjust the relative ranking of this identifier when calculating preferred identifiers. """

def __init_subclass__(cls: type["IdentifierSchema"], **kwargs: Any) -> None:
"""Ensure that subclasses have the required attributes set."""
for required in (
"name",
"namespace",
"human_readable",
):
if not getattr(cls, required, False):
if not hasattr(cls, required):
raise NotImplementedError(f"Can't instantiate IdentifierSchema without {required} attribute.")
super().__init_subclass__(**kwargs)

Expand Down Expand Up @@ -101,6 +108,11 @@ def as_xml_tree(self) -> etree._Element:
def url_slug(self) -> str:
return self.schema.compile_identifier_url_slug(self.value)

@property
def score(self) -> float:
"""Return the score of this identifier, used to calculate the preferred identifier for a document."""
return 1 * self.schema.base_score_multiplier

def same_as(self, other: "Identifier") -> bool:
"Is this the same as another identifier (in value and schema)?"
return self.value == other.value and self.schema == other.schema
Expand Down Expand Up @@ -149,3 +161,22 @@ def as_etree(self) -> etree._Element:
identifiers_root.append(identifier.as_xml_tree)

return identifiers_root

def by_score(self, type: Optional[type[Identifier]] = None) -> list[Identifier]:
"""
:param type: Optionally, an identifier type to constrain this list to.
:return: Return a list of identifiers, sorted by their score in descending order.
"""
identifiers = self.of_type(type) if type else list(self.values())
return sorted(identifiers, key=lambda v: v.score, reverse=True)

def preferred(self, type: Optional[type[Identifier]] = None) -> Optional[Identifier]:
"""
:param type: Optionally, an identifier type to constrain the results to.
:return: Return the highest scoring identifier of the given type (or of any type, if none is specified). Returns `None` if no identifier is available.
"""
if len(self) == 0:
return None
return self.by_score(type)[0]
2 changes: 2 additions & 0 deletions src/caselawclient/models/identifiers/fclid.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ class FindCaseLawIdentifierSchema(IdentifierSchema):

name = "Find Case Law Identifier"
namespace = "fclid"
human_readable = False
base_score_multiplier = 0.8

@classmethod
def validate_identifier(cls, value: str) -> bool:
Expand Down
2 changes: 2 additions & 0 deletions src/caselawclient/models/identifiers/neutral_citation.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ class NeutralCitationNumberSchema(IdentifierSchema):

name = "Neutral Citation Number"
namespace = "ukncn"
human_readable = True
base_score_multiplier = 1.5

@classmethod
def validate_identifier(cls, value: str) -> bool:
Expand Down
23 changes: 10 additions & 13 deletions src/caselawclient/models/judgments.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,20 +25,17 @@ def __init__(self, uri: DocumentURIString, *args: Any, **kwargs: Any) -> None:
super().__init__(self.document_noun, uri, *args, **kwargs)

@cached_property
def neutral_citation(self) -> NeutralCitationString:
return NeutralCitationString(
self.body.get_xpath_match_string(
"/akn:akomaNtoso/akn:*/akn:meta/akn:proprietary/uk:cite/text()",
{
"uk": "https://caselaw.nationalarchives.gov.uk/akn",
"akn": "http://docs.oasis-open.org/legaldocml/ns/akn/3.0",
},
)
def neutral_citation(self) -> Optional[NeutralCitationString]:
value_in_xml = self.body.get_xpath_match_string(
"/akn:akomaNtoso/akn:*/akn:meta/akn:proprietary/uk:cite/text()",
{
"uk": "https://caselaw.nationalarchives.gov.uk/akn",
"akn": "http://docs.oasis-open.org/legaldocml/ns/akn/3.0",
},
)

@property
def best_human_identifier(self) -> str:
return self.neutral_citation
if value_in_xml:
return NeutralCitationString(value_in_xml)
return None

@cached_property
def linked_document(self) -> Optional["PressSummary"]:
Expand Down
12 changes: 8 additions & 4 deletions src/caselawclient/models/neutral_citation_mixin.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
from abc import ABC, abstractmethod
from functools import cached_property
from typing import Any
from typing import Any, Optional

from ds_caselaw_utils import neutral_url
from ds_caselaw_utils.types import NeutralCitationString
from typing_extensions import deprecated


class NeutralCitationMixin(ABC):
Expand Down Expand Up @@ -38,12 +39,15 @@ def __init__(self, document_noun: str, *args: Any, **kwargs: Any) -> None:

@cached_property
@abstractmethod
def neutral_citation(self) -> NeutralCitationString: ...
@deprecated("Legacy usage of NCNs is deprecated; you should be moving to the Identifiers framework")
def neutral_citation(self) -> Optional[NeutralCitationString]: ...

@cached_property
@deprecated("Legacy usage of NCNs is deprecated; you should be moving to the Identifiers framework")
def has_ncn(self) -> bool:
return bool(self.neutral_citation)
return self.neutral_citation is not None and self.neutral_citation != ""

@cached_property
@deprecated("Legacy usage of NCNs is deprecated; you should be moving to the Identifiers framework")
def has_valid_ncn(self) -> bool:
return self.has_ncn and neutral_url(self.neutral_citation) is not None
return self.neutral_citation is not None and neutral_url(self.neutral_citation) is not None
21 changes: 9 additions & 12 deletions src/caselawclient/models/press_summaries.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,19 +27,16 @@ def __init__(self, uri: DocumentURIString, *args: Any, **kwargs: Any) -> None:
super().__init__(self.document_noun, uri, *args, **kwargs)

@cached_property
def neutral_citation(self) -> NeutralCitationString:
return NeutralCitationString(
self.body.get_xpath_match_string(
"/akn:akomaNtoso/akn:doc/akn:preface/akn:p/akn:neutralCitation/text()",
{
"akn": "http://docs.oasis-open.org/legaldocml/ns/akn/3.0",
},
)
def neutral_citation(self) -> Optional[NeutralCitationString]:
value_in_xml = self.body.get_xpath_match_string(
"/akn:akomaNtoso/akn:doc/akn:preface/akn:p/akn:neutralCitation/text()",
{
"akn": "http://docs.oasis-open.org/legaldocml/ns/akn/3.0",
},
)

@property
def best_human_identifier(self) -> str:
return self.neutral_citation
if value_in_xml:
return NeutralCitationString(value_in_xml)
return None

@cached_property
def linked_document(self) -> Optional[Judgment]:
Expand Down
65 changes: 45 additions & 20 deletions tests/models/identifiers/test_identifiers.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,44 @@
from caselawclient.models.identifiers.neutral_citation import NeutralCitationNumber


class TestIdentifierSchema(IdentifierSchema):
__test__ = False

name = "Test Schema"
namespace = "test"
human_readable = True
base_score_multiplier = 2.5

@classmethod
def compile_identifier_url_slug(cls, value: str) -> str:
return value.lower()


class TestIdentifier(Identifier):
__test__ = False

schema = TestIdentifierSchema


@pytest.fixture
def identifiers():
return Identifiers(
{"id-1": TestIdentifier(uuid="id-1", value="TEST-111"), "id-2": TestIdentifier(uuid="id-2", value="TEST-222")}
)


TEST_NCN_1701 = NeutralCitationNumber(value="[1701] UKSC 999")
TEST_NCN_1234 = NeutralCitationNumber(value="[1234] UKSC 999")
TEST_IDENTIFIER_999 = TestIdentifier("TEST-999")


@pytest.fixture
def mixed_identifiers():
return Identifiers(
{
"id-A": NeutralCitationNumber(value="[1701] UKSC 999"),
"id-B": TestIdentifier("TEST-999"),
"id-C": NeutralCitationNumber(value="[1234] UKSC 999"),
"id-A": TEST_NCN_1701,
"id-B": TEST_IDENTIFIER_999,
"id-C": TEST_NCN_1234,
}
)

Expand All @@ -28,23 +52,6 @@ def id3():
return TestIdentifier(uuid="id-3", value="TEST-333")


class TestIdentifierSchema(IdentifierSchema):
__test__ = False

name = "Test Schema"
namespace = "test"

@classmethod
def compile_identifier_url_slug(cls, value: str) -> str:
return value.lower()


class TestIdentifier(Identifier):
__test__ = False

schema = TestIdentifierSchema


class TestIdentifierBase:
def test_uuid_setting(self):
"""Ensure that if a UUID is provided when initialising an Identifier that it is properly stored."""
Expand Down Expand Up @@ -116,3 +123,21 @@ def test_delete_type(self, mixed_identifiers):
assert "TEST-999" in str(mixed_identifiers)
assert "[1701] UKSC 999" not in str(mixed_identifiers)
assert "[1234] UKSC 999" not in str(mixed_identifiers)


class TestIdentifierScoring:
def test_base_scoring(self):
identifier = TestIdentifier(value="TEST-123")
assert identifier.score == 2.5

def test_sorting(self, mixed_identifiers):
assert mixed_identifiers.by_score() == [TEST_IDENTIFIER_999, TEST_NCN_1701, TEST_NCN_1234]

def test_preferred_identifier(self, mixed_identifiers):
assert mixed_identifiers.preferred() == TEST_IDENTIFIER_999

def test_sorting_with_type(self, mixed_identifiers):
assert mixed_identifiers.by_score(type=NeutralCitationNumber) == [TEST_NCN_1701, TEST_NCN_1234]

def test_preferred_identifier_with_type(self, mixed_identifiers):
assert mixed_identifiers.preferred(type=NeutralCitationNumber) == TEST_NCN_1701
6 changes: 4 additions & 2 deletions tests/models/test_judgments.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,17 @@
from caselawclient.errors import DocumentNotFoundError
from caselawclient.factories import PressSummaryFactory
from caselawclient.models.documents import DocumentURIString
from caselawclient.models.identifiers.neutral_citation import NeutralCitationNumber
from caselawclient.models.judgments import Judgment
from caselawclient.models.neutral_citation_mixin import NeutralCitationString


class TestJudgment:
def test_best_identifier(self, mock_api_client):
judgment = Judgment(DocumentURIString("test/1234"), mock_api_client)
judgment.neutral_citation = NeutralCitationString("[2023] TEST 1234")
assert judgment.best_human_identifier == judgment.neutral_citation
document_ncn = NeutralCitationNumber(value="[2023] TEST 1234")
judgment.identifiers.add(document_ncn)
assert judgment.best_human_identifier == document_ncn


class TestJudgmentValidation:
Expand Down
6 changes: 4 additions & 2 deletions tests/models/test_press_summaries.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,17 @@
from caselawclient.errors import DocumentNotFoundError
from caselawclient.factories import JudgmentFactory
from caselawclient.models.documents import DocumentURIString
from caselawclient.models.identifiers.neutral_citation import NeutralCitationNumber
from caselawclient.models.neutral_citation_mixin import NeutralCitationString
from caselawclient.models.press_summaries import PressSummary


class TestPressSummary:
def test_best_identifier(self, mock_api_client):
summary = PressSummary(DocumentURIString("test/1234"), mock_api_client)
summary.neutral_citation = NeutralCitationString("[2023] TEST 1234")
assert summary.best_human_identifier == summary.neutral_citation
document_ncn = NeutralCitationNumber(value="[2023] TEST 1234")
summary.identifiers.add(document_ncn)
assert summary.best_human_identifier == document_ncn


class TestPressSummaryValidation:
Expand Down

0 comments on commit 54332a2

Please sign in to comment.