Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FCL-533] ⚠️ Introduce methods to select the best identifier in various cases #810

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,22 @@ All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog 1.0.0].

## Unreleased

### BREAKING CHANGE

- Methods which were previously guaranteed to return a Neutral Citation may now return `None`.

### Feat

- **FCL-533**: getting scored or preferred identifiers can now be done by type
- **FCL-533**: modify human identifier to rely on identifiers framework
- **FCL-533**: add scoring to Identifiers

### Fix

- **IdentifierSchema**: use hasattr instead of getattr with a default when testing required attributes

## v28.2.0 (2024-12-17)

### Feat
Expand Down
19 changes: 11 additions & 8 deletions src/caselawclient/models/documents/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
NotSupportedOnVersion,
OnlySupportedOnVersion,
)
from caselawclient.models.identifiers import Identifier
from caselawclient.models.identifiers.fclid import FindCaseLawIdentifier, FindCaseLawIdentifierSchema
from caselawclient.models.identifiers.unpacker import unpack_all_identifiers_from_etree
from caselawclient.models.utilities import VersionsDict, extract_version, render_versions
Expand Down Expand Up @@ -171,13 +172,11 @@ def _initialise_identifiers(self) -> None:
self.identifiers = unpack_all_identifiers_from_etree(identifiers_element_as_etree)

@property
def best_human_identifier(self) -> Optional[str]:
"""
Some identifier that is understood by legal professionals to refer to this legal event
that is not the name of the document.
Typically, this will be the neutral citation number, should it exist.
Should typically be overridden in subclasses.
"""
def best_human_identifier(self) -> Optional[Identifier]:
Copy link
Collaborator

@dragon-dxw dragon-dxw Dec 18, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do all identifiers have a sensible __str__ such that this is a good drop-in replacement despite the type change? (It's probably just the value)

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, but I want this to explode where we're not using it properly. This is definitely a breaking change.

"""Return the preferred identifier for the document, providing that it is considered human readable."""
preferred_identifier = self.identifiers.preferred()
if preferred_identifier and preferred_identifier.schema.human_readable:
return preferred_identifier
return None

@property
Expand Down Expand Up @@ -507,13 +506,17 @@ def force_reparse(self) -> None:
"documentType": parser_type_noun,
"metadata": {
"name": self.body.name or None,
"cite": self.best_human_identifier or None,
"cite": None,
"court": self.body.court or None,
"date": checked_date,
"uri": self.uri,
},
}

## TODO: Remove this hack around the fact that NCNs are assumed to be present for all documents' metadata, but actually different document classes may have different metadata
if hasattr(self, "neutral_citation"):
parser_instructions["metadata"]["cite"] = self.neutral_citation

request_parse(
uri=self.uri,
reference=self.consignment_reference,
Expand Down
33 changes: 32 additions & 1 deletion src/caselawclient/models/identifiers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,20 @@ class IdentifierSchema(ABC):
name: str
namespace: str

human_readable: bool
""" Should this identifier type be considered for display as a 'human readable' identifier? """

base_score_multiplier: float = 1.0
""" A multiplier used to adjust the relative ranking of this identifier when calculating preferred identifiers. """

def __init_subclass__(cls: type["IdentifierSchema"], **kwargs: Any) -> None:
"""Ensure that subclasses have the required attributes set."""
for required in (
"name",
"namespace",
"human_readable",
):
if not getattr(cls, required, False):
if not hasattr(cls, required):
jacksonj04 marked this conversation as resolved.
Show resolved Hide resolved
raise NotImplementedError(f"Can't instantiate IdentifierSchema without {required} attribute.")
super().__init_subclass__(**kwargs)

Expand Down Expand Up @@ -101,6 +108,11 @@ def as_xml_tree(self) -> etree._Element:
def url_slug(self) -> str:
return self.schema.compile_identifier_url_slug(self.value)

@property
def score(self) -> float:
"""Return the score of this identifier, used to calculate the preferred identifier for a document."""
return 1 * self.schema.base_score_multiplier

def same_as(self, other: "Identifier") -> bool:
"Is this the same as another identifier (in value and schema)?"
return self.value == other.value and self.schema == other.schema
Expand Down Expand Up @@ -149,3 +161,22 @@ def as_etree(self) -> etree._Element:
identifiers_root.append(identifier.as_xml_tree)

return identifiers_root

def by_score(self, type: Optional[type[Identifier]] = None) -> list[Identifier]:
"""
:param type: Optionally, an identifier type to constrain this list to.

:return: Return a list of identifiers, sorted by their score in descending order.
"""
identifiers = self.of_type(type) if type else list(self.values())
return sorted(identifiers, key=lambda v: v.score, reverse=True)

def preferred(self, type: Optional[type[Identifier]] = None) -> Optional[Identifier]:
"""
:param type: Optionally, an identifier type to constrain the results to.

:return: Return the highest scoring identifier of the given type (or of any type, if none is specified). Returns `None` if no identifier is available.
"""
if len(self) == 0:
return None
return self.by_score(type)[0]
2 changes: 2 additions & 0 deletions src/caselawclient/models/identifiers/fclid.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ class FindCaseLawIdentifierSchema(IdentifierSchema):

name = "Find Case Law Identifier"
namespace = "fclid"
human_readable = False
base_score_multiplier = 0.8

@classmethod
def validate_identifier(cls, value: str) -> bool:
Expand Down
2 changes: 2 additions & 0 deletions src/caselawclient/models/identifiers/neutral_citation.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ class NeutralCitationNumberSchema(IdentifierSchema):

name = "Neutral Citation Number"
namespace = "ukncn"
human_readable = True
base_score_multiplier = 1.5

@classmethod
def validate_identifier(cls, value: str) -> bool:
Expand Down
23 changes: 10 additions & 13 deletions src/caselawclient/models/judgments.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,20 +25,17 @@ def __init__(self, uri: DocumentURIString, *args: Any, **kwargs: Any) -> None:
super().__init__(self.document_noun, uri, *args, **kwargs)

@cached_property
def neutral_citation(self) -> NeutralCitationString:
return NeutralCitationString(
self.body.get_xpath_match_string(
"/akn:akomaNtoso/akn:*/akn:meta/akn:proprietary/uk:cite/text()",
{
"uk": "https://caselaw.nationalarchives.gov.uk/akn",
"akn": "http://docs.oasis-open.org/legaldocml/ns/akn/3.0",
},
)
def neutral_citation(self) -> Optional[NeutralCitationString]:
value_in_xml = self.body.get_xpath_match_string(
"/akn:akomaNtoso/akn:*/akn:meta/akn:proprietary/uk:cite/text()",
{
"uk": "https://caselaw.nationalarchives.gov.uk/akn",
"akn": "http://docs.oasis-open.org/legaldocml/ns/akn/3.0",
},
)

@property
def best_human_identifier(self) -> str:
return self.neutral_citation
if value_in_xml:
return NeutralCitationString(value_in_xml)
return None

@cached_property
def linked_document(self) -> Optional["PressSummary"]:
Expand Down
12 changes: 8 additions & 4 deletions src/caselawclient/models/neutral_citation_mixin.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
from abc import ABC, abstractmethod
from functools import cached_property
from typing import Any
from typing import Any, Optional

from ds_caselaw_utils import neutral_url
from ds_caselaw_utils.types import NeutralCitationString
from typing_extensions import deprecated


class NeutralCitationMixin(ABC):
Expand Down Expand Up @@ -38,12 +39,15 @@ def __init__(self, document_noun: str, *args: Any, **kwargs: Any) -> None:

@cached_property
@abstractmethod
def neutral_citation(self) -> NeutralCitationString: ...
@deprecated("Legacy usage of NCNs is deprecated; you should be moving to the Identifiers framework")
def neutral_citation(self) -> Optional[NeutralCitationString]: ...

@cached_property
@deprecated("Legacy usage of NCNs is deprecated; you should be moving to the Identifiers framework")
def has_ncn(self) -> bool:
return bool(self.neutral_citation)
return self.neutral_citation is not None and self.neutral_citation != ""

@cached_property
@deprecated("Legacy usage of NCNs is deprecated; you should be moving to the Identifiers framework")
def has_valid_ncn(self) -> bool:
return self.has_ncn and neutral_url(self.neutral_citation) is not None
return self.neutral_citation is not None and neutral_url(self.neutral_citation) is not None
21 changes: 9 additions & 12 deletions src/caselawclient/models/press_summaries.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,19 +27,16 @@ def __init__(self, uri: DocumentURIString, *args: Any, **kwargs: Any) -> None:
super().__init__(self.document_noun, uri, *args, **kwargs)

@cached_property
def neutral_citation(self) -> NeutralCitationString:
return NeutralCitationString(
self.body.get_xpath_match_string(
"/akn:akomaNtoso/akn:doc/akn:preface/akn:p/akn:neutralCitation/text()",
{
"akn": "http://docs.oasis-open.org/legaldocml/ns/akn/3.0",
},
)
def neutral_citation(self) -> Optional[NeutralCitationString]:
value_in_xml = self.body.get_xpath_match_string(
"/akn:akomaNtoso/akn:doc/akn:preface/akn:p/akn:neutralCitation/text()",
{
"akn": "http://docs.oasis-open.org/legaldocml/ns/akn/3.0",
},
)

@property
def best_human_identifier(self) -> str:
return self.neutral_citation
if value_in_xml:
return NeutralCitationString(value_in_xml)
return None

@cached_property
def linked_document(self) -> Optional[Judgment]:
Expand Down
65 changes: 45 additions & 20 deletions tests/models/identifiers/test_identifiers.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,44 @@
from caselawclient.models.identifiers.neutral_citation import NeutralCitationNumber


class TestIdentifierSchema(IdentifierSchema):
__test__ = False

name = "Test Schema"
namespace = "test"
human_readable = True
base_score_multiplier = 2.5

@classmethod
def compile_identifier_url_slug(cls, value: str) -> str:
return value.lower()


class TestIdentifier(Identifier):
__test__ = False

schema = TestIdentifierSchema


@pytest.fixture
def identifiers():
return Identifiers(
{"id-1": TestIdentifier(uuid="id-1", value="TEST-111"), "id-2": TestIdentifier(uuid="id-2", value="TEST-222")}
)


TEST_NCN_1701 = NeutralCitationNumber(value="[1701] UKSC 999")
TEST_NCN_1234 = NeutralCitationNumber(value="[1234] UKSC 999")
TEST_IDENTIFIER_999 = TestIdentifier("TEST-999")


@pytest.fixture
def mixed_identifiers():
return Identifiers(
{
"id-A": NeutralCitationNumber(value="[1701] UKSC 999"),
"id-B": TestIdentifier("TEST-999"),
"id-C": NeutralCitationNumber(value="[1234] UKSC 999"),
"id-A": TEST_NCN_1701,
"id-B": TEST_IDENTIFIER_999,
"id-C": TEST_NCN_1234,
}
)

Expand All @@ -28,23 +52,6 @@ def id3():
return TestIdentifier(uuid="id-3", value="TEST-333")


class TestIdentifierSchema(IdentifierSchema):
__test__ = False

name = "Test Schema"
namespace = "test"

@classmethod
def compile_identifier_url_slug(cls, value: str) -> str:
return value.lower()


class TestIdentifier(Identifier):
__test__ = False

schema = TestIdentifierSchema


class TestIdentifierBase:
def test_uuid_setting(self):
"""Ensure that if a UUID is provided when initialising an Identifier that it is properly stored."""
Expand Down Expand Up @@ -116,3 +123,21 @@ def test_delete_type(self, mixed_identifiers):
assert "TEST-999" in str(mixed_identifiers)
assert "[1701] UKSC 999" not in str(mixed_identifiers)
assert "[1234] UKSC 999" not in str(mixed_identifiers)


class TestIdentifierScoring:
def test_base_scoring(self):
identifier = TestIdentifier(value="TEST-123")
assert identifier.score == 2.5

def test_sorting(self, mixed_identifiers):
assert mixed_identifiers.by_score() == [TEST_IDENTIFIER_999, TEST_NCN_1701, TEST_NCN_1234]

def test_preferred_identifier(self, mixed_identifiers):
assert mixed_identifiers.preferred() == TEST_IDENTIFIER_999

def test_sorting_with_type(self, mixed_identifiers):
assert mixed_identifiers.by_score(type=NeutralCitationNumber) == [TEST_NCN_1701, TEST_NCN_1234]

def test_preferred_identifier_with_type(self, mixed_identifiers):
assert mixed_identifiers.preferred(type=NeutralCitationNumber) == TEST_NCN_1701
6 changes: 4 additions & 2 deletions tests/models/test_judgments.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,17 @@
from caselawclient.errors import DocumentNotFoundError
from caselawclient.factories import PressSummaryFactory
from caselawclient.models.documents import DocumentURIString
from caselawclient.models.identifiers.neutral_citation import NeutralCitationNumber
from caselawclient.models.judgments import Judgment
from caselawclient.models.neutral_citation_mixin import NeutralCitationString


class TestJudgment:
def test_best_identifier(self, mock_api_client):
judgment = Judgment(DocumentURIString("test/1234"), mock_api_client)
judgment.neutral_citation = NeutralCitationString("[2023] TEST 1234")
assert judgment.best_human_identifier == judgment.neutral_citation
document_ncn = NeutralCitationNumber(value="[2023] TEST 1234")
judgment.identifiers.add(document_ncn)
assert judgment.best_human_identifier == document_ncn


class TestJudgmentValidation:
Expand Down
6 changes: 4 additions & 2 deletions tests/models/test_press_summaries.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,17 @@
from caselawclient.errors import DocumentNotFoundError
from caselawclient.factories import JudgmentFactory
from caselawclient.models.documents import DocumentURIString
from caselawclient.models.identifiers.neutral_citation import NeutralCitationNumber
from caselawclient.models.neutral_citation_mixin import NeutralCitationString
from caselawclient.models.press_summaries import PressSummary


class TestPressSummary:
def test_best_identifier(self, mock_api_client):
summary = PressSummary(DocumentURIString("test/1234"), mock_api_client)
summary.neutral_citation = NeutralCitationString("[2023] TEST 1234")
assert summary.best_human_identifier == summary.neutral_citation
document_ncn = NeutralCitationNumber(value="[2023] TEST 1234")
summary.identifiers.add(document_ncn)
assert summary.best_human_identifier == document_ncn


class TestPressSummaryValidation:
Expand Down
Loading