From 51b6dc9fd7591b3de82583b11ae6aeb1f0ad5aa3 Mon Sep 17 00:00:00 2001 From: Nick Jackson Date: Wed, 18 Dec 2024 09:31:09 +0000 Subject: [PATCH 1/5] fix(IdentifierSchema): use hasattr instead of getattr with a default when testing required attributes Using `getattr` with a default of `False` will raise an unexpected error when the attribute is present and has a value of `False`. `hasattr` simply checks for its existence, regardless of value. --- src/caselawclient/models/identifiers/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/caselawclient/models/identifiers/__init__.py b/src/caselawclient/models/identifiers/__init__.py index d87a3015..47ad65eb 100644 --- a/src/caselawclient/models/identifiers/__init__.py +++ b/src/caselawclient/models/identifiers/__init__.py @@ -38,7 +38,7 @@ def __init_subclass__(cls: type["IdentifierSchema"], **kwargs: Any) -> None: "name", "namespace", ): - if not getattr(cls, required, False): + if not hasattr(cls, required): raise NotImplementedError(f"Can't instantiate IdentifierSchema without {required} attribute.") super().__init_subclass__(**kwargs) From 13855d53bca97c061520c64dea21f868743ebcfc Mon Sep 17 00:00:00 2001 From: Nick Jackson Date: Wed, 18 Dec 2024 09:49:13 +0000 Subject: [PATCH 2/5] feat(FCL-533): add scoring to Identifiers This allows us to decide which of a range of possible identifiers should be preferred for a document --- .../models/identifiers/__init__.py | 24 ++++++++ src/caselawclient/models/identifiers/fclid.py | 2 + .../models/identifiers/neutral_citation.py | 2 + tests/models/identifiers/test_identifiers.py | 59 ++++++++++++------- 4 files changed, 67 insertions(+), 20 deletions(-) diff --git a/src/caselawclient/models/identifiers/__init__.py b/src/caselawclient/models/identifiers/__init__.py index 47ad65eb..1a6a8c5e 100644 --- a/src/caselawclient/models/identifiers/__init__.py +++ b/src/caselawclient/models/identifiers/__init__.py @@ -32,11 +32,18 @@ class IdentifierSchema(ABC): name: str namespace: str + human_readable: bool + """ Should this identifier type be considered for display as a 'human readable' identifier? """ + + base_score_multiplier: float = 1.0 + """ A multiplier used to adjust the relative ranking of this identifier when calculating preferred identifiers. """ + def __init_subclass__(cls: type["IdentifierSchema"], **kwargs: Any) -> None: """Ensure that subclasses have the required attributes set.""" for required in ( "name", "namespace", + "human_readable", ): if not hasattr(cls, required): raise NotImplementedError(f"Can't instantiate IdentifierSchema without {required} attribute.") @@ -101,6 +108,11 @@ def as_xml_tree(self) -> etree._Element: def url_slug(self) -> str: return self.schema.compile_identifier_url_slug(self.value) + @property + def score(self) -> float: + """Return the score of this identifier, used to calculate the preferred identifier for a document.""" + return 1 * self.schema.base_score_multiplier + def same_as(self, other: "Identifier") -> bool: "Is this the same as another identifier (in value and schema)?" return self.value == other.value and self.schema == other.schema @@ -149,3 +161,15 @@ def as_etree(self) -> etree._Element: identifiers_root.append(identifier.as_xml_tree) return identifiers_root + + @property + def by_score(self) -> list[Identifier]: + """Return a list of identifiers, sorted by their score in descending order.""" + return sorted(self.values(), key=lambda v: v.score, reverse=True) + + @property + def preferred(self) -> Optional[Identifier]: + """If this document has identifiers, return the preferred one.""" + if len(self) == 0: + return None + return self.by_score[0] diff --git a/src/caselawclient/models/identifiers/fclid.py b/src/caselawclient/models/identifiers/fclid.py index 83e75891..a5dd68be 100644 --- a/src/caselawclient/models/identifiers/fclid.py +++ b/src/caselawclient/models/identifiers/fclid.py @@ -27,6 +27,8 @@ class FindCaseLawIdentifierSchema(IdentifierSchema): name = "Find Case Law Identifier" namespace = "fclid" + human_readable = False + base_score_multiplier = 0.8 @classmethod def validate_identifier(cls, value: str) -> bool: diff --git a/src/caselawclient/models/identifiers/neutral_citation.py b/src/caselawclient/models/identifiers/neutral_citation.py index b8fc6fe2..e4d40332 100644 --- a/src/caselawclient/models/identifiers/neutral_citation.py +++ b/src/caselawclient/models/identifiers/neutral_citation.py @@ -30,6 +30,8 @@ class NeutralCitationNumberSchema(IdentifierSchema): name = "Neutral Citation Number" namespace = "ukncn" + human_readable = True + base_score_multiplier = 1.5 @classmethod def validate_identifier(cls, value: str) -> bool: diff --git a/tests/models/identifiers/test_identifiers.py b/tests/models/identifiers/test_identifiers.py index d317eae8..b7739cf0 100644 --- a/tests/models/identifiers/test_identifiers.py +++ b/tests/models/identifiers/test_identifiers.py @@ -5,6 +5,25 @@ from caselawclient.models.identifiers.neutral_citation import NeutralCitationNumber +class TestIdentifierSchema(IdentifierSchema): + __test__ = False + + name = "Test Schema" + namespace = "test" + human_readable = True + base_score_multiplier = 2.5 + + @classmethod + def compile_identifier_url_slug(cls, value: str) -> str: + return value.lower() + + +class TestIdentifier(Identifier): + __test__ = False + + schema = TestIdentifierSchema + + @pytest.fixture def identifiers(): return Identifiers( @@ -12,13 +31,18 @@ def identifiers(): ) +TEST_NCN_1701 = NeutralCitationNumber(value="[1701] UKSC 999") +TEST_NCN_1234 = NeutralCitationNumber(value="[1234] UKSC 999") +TEST_IDENTIFIER_999 = TestIdentifier("TEST-999") + + @pytest.fixture def mixed_identifiers(): return Identifiers( { - "id-A": NeutralCitationNumber(value="[1701] UKSC 999"), - "id-B": TestIdentifier("TEST-999"), - "id-C": NeutralCitationNumber(value="[1234] UKSC 999"), + "id-A": TEST_NCN_1701, + "id-B": TEST_IDENTIFIER_999, + "id-C": TEST_NCN_1234, } ) @@ -28,23 +52,6 @@ def id3(): return TestIdentifier(uuid="id-3", value="TEST-333") -class TestIdentifierSchema(IdentifierSchema): - __test__ = False - - name = "Test Schema" - namespace = "test" - - @classmethod - def compile_identifier_url_slug(cls, value: str) -> str: - return value.lower() - - -class TestIdentifier(Identifier): - __test__ = False - - schema = TestIdentifierSchema - - class TestIdentifierBase: def test_uuid_setting(self): """Ensure that if a UUID is provided when initialising an Identifier that it is properly stored.""" @@ -116,3 +123,15 @@ def test_delete_type(self, mixed_identifiers): assert "TEST-999" in str(mixed_identifiers) assert "[1701] UKSC 999" not in str(mixed_identifiers) assert "[1234] UKSC 999" not in str(mixed_identifiers) + + +class TestIdentifierScoring: + def test_base_scoring(self): + identifier = TestIdentifier(value="TEST-123") + assert identifier.score == 2.5 + + def test_sorting(self, mixed_identifiers): + assert mixed_identifiers.by_score == [TEST_IDENTIFIER_999, TEST_NCN_1701, TEST_NCN_1234] + + def test_preferred_identifier(self, mixed_identifiers): + assert mixed_identifiers.preferred == TEST_IDENTIFIER_999 From ec2c226c907a2f77c410b0b8a914c0f8afd5f694 Mon Sep 17 00:00:00 2001 From: Nick Jackson Date: Wed, 18 Dec 2024 10:41:05 +0000 Subject: [PATCH 3/5] feat(FCL-533): modify human identifier to rely on identifiers framework This is instead of assuming that an NCN will always be present. As a result, this has had knock-on changes in some other places which were previously relying on this incorrect behaviour. This includes some breaking changes to method signatures, which can now return a `None` in places they previously could not. Since relying on the presence of an NCN is now considered dangerous, these methods have been marked as deprecated. BREAKING CHANGE: Methods which were previously guaranteed to return a Neutral Citation may now return `None`. --- .../models/documents/__init__.py | 19 ++++++++------- src/caselawclient/models/judgments.py | 23 ++++++++----------- .../models/neutral_citation_mixin.py | 12 ++++++---- src/caselawclient/models/press_summaries.py | 21 ++++++++--------- tests/models/test_judgments.py | 6 +++-- tests/models/test_press_summaries.py | 6 +++-- 6 files changed, 46 insertions(+), 41 deletions(-) diff --git a/src/caselawclient/models/documents/__init__.py b/src/caselawclient/models/documents/__init__.py index 6af19c3e..a85f1763 100644 --- a/src/caselawclient/models/documents/__init__.py +++ b/src/caselawclient/models/documents/__init__.py @@ -15,6 +15,7 @@ NotSupportedOnVersion, OnlySupportedOnVersion, ) +from caselawclient.models.identifiers import Identifier from caselawclient.models.identifiers.fclid import FindCaseLawIdentifier, FindCaseLawIdentifierSchema from caselawclient.models.identifiers.unpacker import unpack_all_identifiers_from_etree from caselawclient.models.utilities import VersionsDict, extract_version, render_versions @@ -171,13 +172,11 @@ def _initialise_identifiers(self) -> None: self.identifiers = unpack_all_identifiers_from_etree(identifiers_element_as_etree) @property - def best_human_identifier(self) -> Optional[str]: - """ - Some identifier that is understood by legal professionals to refer to this legal event - that is not the name of the document. - Typically, this will be the neutral citation number, should it exist. - Should typically be overridden in subclasses. - """ + def best_human_identifier(self) -> Optional[Identifier]: + """Return the preferred identifier for the document, providing that it is considered human readable.""" + preferred_identifier = self.identifiers.preferred + if preferred_identifier and preferred_identifier.schema.human_readable: + return preferred_identifier return None @property @@ -507,13 +506,17 @@ def force_reparse(self) -> None: "documentType": parser_type_noun, "metadata": { "name": self.body.name or None, - "cite": self.best_human_identifier or None, + "cite": None, "court": self.body.court or None, "date": checked_date, "uri": self.uri, }, } + ## TODO: Remove this hack around the fact that NCNs are assumed to be present for all documents' metadata, but actually different document classes may have different metadata + if hasattr(self, "neutral_citation"): + parser_instructions["metadata"]["cite"] = self.neutral_citation + request_parse( uri=self.uri, reference=self.consignment_reference, diff --git a/src/caselawclient/models/judgments.py b/src/caselawclient/models/judgments.py index 730c3c0c..a2d0fffe 100644 --- a/src/caselawclient/models/judgments.py +++ b/src/caselawclient/models/judgments.py @@ -25,20 +25,17 @@ def __init__(self, uri: DocumentURIString, *args: Any, **kwargs: Any) -> None: super().__init__(self.document_noun, uri, *args, **kwargs) @cached_property - def neutral_citation(self) -> NeutralCitationString: - return NeutralCitationString( - self.body.get_xpath_match_string( - "/akn:akomaNtoso/akn:*/akn:meta/akn:proprietary/uk:cite/text()", - { - "uk": "https://caselaw.nationalarchives.gov.uk/akn", - "akn": "http://docs.oasis-open.org/legaldocml/ns/akn/3.0", - }, - ) + def neutral_citation(self) -> Optional[NeutralCitationString]: + value_in_xml = self.body.get_xpath_match_string( + "/akn:akomaNtoso/akn:*/akn:meta/akn:proprietary/uk:cite/text()", + { + "uk": "https://caselaw.nationalarchives.gov.uk/akn", + "akn": "http://docs.oasis-open.org/legaldocml/ns/akn/3.0", + }, ) - - @property - def best_human_identifier(self) -> str: - return self.neutral_citation + if value_in_xml: + return NeutralCitationString(value_in_xml) + return None @cached_property def linked_document(self) -> Optional["PressSummary"]: diff --git a/src/caselawclient/models/neutral_citation_mixin.py b/src/caselawclient/models/neutral_citation_mixin.py index 6b55a125..7f5c38df 100644 --- a/src/caselawclient/models/neutral_citation_mixin.py +++ b/src/caselawclient/models/neutral_citation_mixin.py @@ -1,9 +1,10 @@ from abc import ABC, abstractmethod from functools import cached_property -from typing import Any +from typing import Any, Optional from ds_caselaw_utils import neutral_url from ds_caselaw_utils.types import NeutralCitationString +from typing_extensions import deprecated class NeutralCitationMixin(ABC): @@ -38,12 +39,15 @@ def __init__(self, document_noun: str, *args: Any, **kwargs: Any) -> None: @cached_property @abstractmethod - def neutral_citation(self) -> NeutralCitationString: ... + @deprecated("Legacy usage of NCNs is deprecated; you should be moving to the Identifiers framework") + def neutral_citation(self) -> Optional[NeutralCitationString]: ... @cached_property + @deprecated("Legacy usage of NCNs is deprecated; you should be moving to the Identifiers framework") def has_ncn(self) -> bool: - return bool(self.neutral_citation) + return self.neutral_citation is not None and self.neutral_citation != "" @cached_property + @deprecated("Legacy usage of NCNs is deprecated; you should be moving to the Identifiers framework") def has_valid_ncn(self) -> bool: - return self.has_ncn and neutral_url(self.neutral_citation) is not None + return self.neutral_citation is not None and neutral_url(self.neutral_citation) is not None diff --git a/src/caselawclient/models/press_summaries.py b/src/caselawclient/models/press_summaries.py index 2996c132..587ba051 100644 --- a/src/caselawclient/models/press_summaries.py +++ b/src/caselawclient/models/press_summaries.py @@ -27,19 +27,16 @@ def __init__(self, uri: DocumentURIString, *args: Any, **kwargs: Any) -> None: super().__init__(self.document_noun, uri, *args, **kwargs) @cached_property - def neutral_citation(self) -> NeutralCitationString: - return NeutralCitationString( - self.body.get_xpath_match_string( - "/akn:akomaNtoso/akn:doc/akn:preface/akn:p/akn:neutralCitation/text()", - { - "akn": "http://docs.oasis-open.org/legaldocml/ns/akn/3.0", - }, - ) + def neutral_citation(self) -> Optional[NeutralCitationString]: + value_in_xml = self.body.get_xpath_match_string( + "/akn:akomaNtoso/akn:doc/akn:preface/akn:p/akn:neutralCitation/text()", + { + "akn": "http://docs.oasis-open.org/legaldocml/ns/akn/3.0", + }, ) - - @property - def best_human_identifier(self) -> str: - return self.neutral_citation + if value_in_xml: + return NeutralCitationString(value_in_xml) + return None @cached_property def linked_document(self) -> Optional[Judgment]: diff --git a/tests/models/test_judgments.py b/tests/models/test_judgments.py index 814ea087..af43bb2f 100644 --- a/tests/models/test_judgments.py +++ b/tests/models/test_judgments.py @@ -5,6 +5,7 @@ from caselawclient.errors import DocumentNotFoundError from caselawclient.factories import PressSummaryFactory from caselawclient.models.documents import DocumentURIString +from caselawclient.models.identifiers.neutral_citation import NeutralCitationNumber from caselawclient.models.judgments import Judgment from caselawclient.models.neutral_citation_mixin import NeutralCitationString @@ -12,8 +13,9 @@ class TestJudgment: def test_best_identifier(self, mock_api_client): judgment = Judgment(DocumentURIString("test/1234"), mock_api_client) - judgment.neutral_citation = NeutralCitationString("[2023] TEST 1234") - assert judgment.best_human_identifier == judgment.neutral_citation + document_ncn = NeutralCitationNumber(value="[2023] TEST 1234") + judgment.identifiers.add(document_ncn) + assert judgment.best_human_identifier == document_ncn class TestJudgmentValidation: diff --git a/tests/models/test_press_summaries.py b/tests/models/test_press_summaries.py index cd4e461b..d5bf3251 100644 --- a/tests/models/test_press_summaries.py +++ b/tests/models/test_press_summaries.py @@ -5,6 +5,7 @@ from caselawclient.errors import DocumentNotFoundError from caselawclient.factories import JudgmentFactory from caselawclient.models.documents import DocumentURIString +from caselawclient.models.identifiers.neutral_citation import NeutralCitationNumber from caselawclient.models.neutral_citation_mixin import NeutralCitationString from caselawclient.models.press_summaries import PressSummary @@ -12,8 +13,9 @@ class TestPressSummary: def test_best_identifier(self, mock_api_client): summary = PressSummary(DocumentURIString("test/1234"), mock_api_client) - summary.neutral_citation = NeutralCitationString("[2023] TEST 1234") - assert summary.best_human_identifier == summary.neutral_citation + document_ncn = NeutralCitationNumber(value="[2023] TEST 1234") + summary.identifiers.add(document_ncn) + assert summary.best_human_identifier == document_ncn class TestPressSummaryValidation: From 22cc54d3d941495e1aff2b86a2f24ccb8bd62983 Mon Sep 17 00:00:00 2001 From: Nick Jackson Date: Wed, 18 Dec 2024 10:42:35 +0000 Subject: [PATCH 4/5] docs: update CHANGELOG --- CHANGELOG.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a225f0bf..67edc429 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,21 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog 1.0.0]. +## Unreleased + +### BREAKING CHANGE + +- Methods which were previously guaranteed to return a Neutral Citation may now return `None`. + +### Feat + +- **FCL-533**: modify human identifier to rely on identifiers framework +- **FCL-533**: add scoring to Identifiers + +### Fix + +- **IdentifierSchema**: use hasattr instead of getattr with a default when testing required attributes + ## v28.2.0 (2024-12-17) ### Feat From b2044547e9ad06b8595dafe7ddcccc8f46a0eab0 Mon Sep 17 00:00:00 2001 From: Nick Jackson Date: Wed, 18 Dec 2024 11:17:38 +0000 Subject: [PATCH 5/5] feat(FCL-533): getting scored or preferred identifiers can now be done by type --- CHANGELOG.md | 1 + .../models/documents/__init__.py | 2 +- .../models/identifiers/__init__.py | 23 ++++++++++++------- tests/models/identifiers/test_identifiers.py | 10 ++++++-- 4 files changed, 25 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 67edc429..e874bb8c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ The format is based on [Keep a Changelog 1.0.0]. ### Feat +- **FCL-533**: getting scored or preferred identifiers can now be done by type - **FCL-533**: modify human identifier to rely on identifiers framework - **FCL-533**: add scoring to Identifiers diff --git a/src/caselawclient/models/documents/__init__.py b/src/caselawclient/models/documents/__init__.py index a85f1763..01ba119a 100644 --- a/src/caselawclient/models/documents/__init__.py +++ b/src/caselawclient/models/documents/__init__.py @@ -174,7 +174,7 @@ def _initialise_identifiers(self) -> None: @property def best_human_identifier(self) -> Optional[Identifier]: """Return the preferred identifier for the document, providing that it is considered human readable.""" - preferred_identifier = self.identifiers.preferred + preferred_identifier = self.identifiers.preferred() if preferred_identifier and preferred_identifier.schema.human_readable: return preferred_identifier return None diff --git a/src/caselawclient/models/identifiers/__init__.py b/src/caselawclient/models/identifiers/__init__.py index 1a6a8c5e..a87a3669 100644 --- a/src/caselawclient/models/identifiers/__init__.py +++ b/src/caselawclient/models/identifiers/__init__.py @@ -162,14 +162,21 @@ def as_etree(self) -> etree._Element: return identifiers_root - @property - def by_score(self) -> list[Identifier]: - """Return a list of identifiers, sorted by their score in descending order.""" - return sorted(self.values(), key=lambda v: v.score, reverse=True) + def by_score(self, type: Optional[type[Identifier]] = None) -> list[Identifier]: + """ + :param type: Optionally, an identifier type to constrain this list to. - @property - def preferred(self) -> Optional[Identifier]: - """If this document has identifiers, return the preferred one.""" + :return: Return a list of identifiers, sorted by their score in descending order. + """ + identifiers = self.of_type(type) if type else list(self.values()) + return sorted(identifiers, key=lambda v: v.score, reverse=True) + + def preferred(self, type: Optional[type[Identifier]] = None) -> Optional[Identifier]: + """ + :param type: Optionally, an identifier type to constrain the results to. + + :return: Return the highest scoring identifier of the given type (or of any type, if none is specified). Returns `None` if no identifier is available. + """ if len(self) == 0: return None - return self.by_score[0] + return self.by_score(type)[0] diff --git a/tests/models/identifiers/test_identifiers.py b/tests/models/identifiers/test_identifiers.py index b7739cf0..94a034ab 100644 --- a/tests/models/identifiers/test_identifiers.py +++ b/tests/models/identifiers/test_identifiers.py @@ -131,7 +131,13 @@ def test_base_scoring(self): assert identifier.score == 2.5 def test_sorting(self, mixed_identifiers): - assert mixed_identifiers.by_score == [TEST_IDENTIFIER_999, TEST_NCN_1701, TEST_NCN_1234] + assert mixed_identifiers.by_score() == [TEST_IDENTIFIER_999, TEST_NCN_1701, TEST_NCN_1234] def test_preferred_identifier(self, mixed_identifiers): - assert mixed_identifiers.preferred == TEST_IDENTIFIER_999 + assert mixed_identifiers.preferred() == TEST_IDENTIFIER_999 + + def test_sorting_with_type(self, mixed_identifiers): + assert mixed_identifiers.by_score(type=NeutralCitationNumber) == [TEST_NCN_1701, TEST_NCN_1234] + + def test_preferred_identifier_with_type(self, mixed_identifiers): + assert mixed_identifiers.preferred(type=NeutralCitationNumber) == TEST_NCN_1701