Merge pull request #810 from nationalarchives/FCL-533-pui-needs-to-be…

…-able-to-select-the-best-identifier-for-any-document [FCL-533] ⚠️ Introduce methods to select the best identifier in various cases
nationalarchives · Dec 18, 2024 · 54332a2 · 54332a2
2 parents 33002a4 + b204454
commit 54332a2
Show file tree

Hide file tree

Showing 11 changed files with 143 additions and 62 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,22 @@ All notable changes to this project will be documented in this file.
 
 The format is based on [Keep a Changelog 1.0.0].
 
+## Unreleased
+
+### BREAKING CHANGE
+
+- Methods which were previously guaranteed to return a Neutral Citation may now return `None`.
+
+### Feat
+
+- **FCL-533**: getting scored or preferred identifiers can now be done by type
+- **FCL-533**: modify human identifier to rely on identifiers framework
+- **FCL-533**: add scoring to Identifiers
+
+### Fix
+
+- **IdentifierSchema**: use hasattr instead of getattr with a default when testing required attributes
+
 ## v28.2.0 (2024-12-17)
 
 ### Feat

diff --git a/src/caselawclient/models/documents/__init__.py b/src/caselawclient/models/documents/__init__.py
@@ -15,6 +15,7 @@
     NotSupportedOnVersion,
     OnlySupportedOnVersion,
 )
+from caselawclient.models.identifiers import Identifier
 from caselawclient.models.identifiers.fclid import FindCaseLawIdentifier, FindCaseLawIdentifierSchema
 from caselawclient.models.identifiers.unpacker import unpack_all_identifiers_from_etree
 from caselawclient.models.utilities import VersionsDict, extract_version, render_versions
@@ -171,13 +172,11 @@ def _initialise_identifiers(self) -> None:
         self.identifiers = unpack_all_identifiers_from_etree(identifiers_element_as_etree)
 
     @property
-    def best_human_identifier(self) -> Optional[str]:
-        """
-        Some identifier that is understood by legal professionals to refer to this legal event
-        that is not the name of the document.
-        Typically, this will be the neutral citation number, should it exist.
-        Should typically be overridden in subclasses.
-        """
+    def best_human_identifier(self) -> Optional[Identifier]:
+        """Return the preferred identifier for the document, providing that it is considered human readable."""
+        preferred_identifier = self.identifiers.preferred()
+        if preferred_identifier and preferred_identifier.schema.human_readable:
+            return preferred_identifier
         return None
 
     @property
@@ -507,13 +506,17 @@ def force_reparse(self) -> None:
             "documentType": parser_type_noun,
             "metadata": {
                 "name": self.body.name or None,
-                "cite": self.best_human_identifier or None,
+                "cite": None,
                 "court": self.body.court or None,
                 "date": checked_date,
                 "uri": self.uri,
             },
         }
 
+        ## TODO: Remove this hack around the fact that NCNs are assumed to be present for all documents' metadata, but actually different document classes may have different metadata
+        if hasattr(self, "neutral_citation"):
+            parser_instructions["metadata"]["cite"] = self.neutral_citation
+
         request_parse(
             uri=self.uri,
             reference=self.consignment_reference,

diff --git a/src/caselawclient/models/identifiers/__init__.py b/src/caselawclient/models/identifiers/__init__.py
@@ -32,13 +32,20 @@ class IdentifierSchema(ABC):
     name: str
     namespace: str
 
+    human_readable: bool
+    """ Should this identifier type be considered for display as a 'human readable' identifier? """
+
+    base_score_multiplier: float = 1.0
+    """ A multiplier used to adjust the relative ranking of this identifier when calculating preferred identifiers. """
+
     def __init_subclass__(cls: type["IdentifierSchema"], **kwargs: Any) -> None:
         """Ensure that subclasses have the required attributes set."""
         for required in (
             "name",
             "namespace",
+            "human_readable",
         ):
-            if not getattr(cls, required, False):
+            if not hasattr(cls, required):
                 raise NotImplementedError(f"Can't instantiate IdentifierSchema without {required} attribute.")
         super().__init_subclass__(**kwargs)
 
@@ -101,6 +108,11 @@ def as_xml_tree(self) -> etree._Element:
     def url_slug(self) -> str:
         return self.schema.compile_identifier_url_slug(self.value)
 
+    @property
+    def score(self) -> float:
+        """Return the score of this identifier, used to calculate the preferred identifier for a document."""
+        return 1 * self.schema.base_score_multiplier
+
     def same_as(self, other: "Identifier") -> bool:
         "Is this the same as another identifier (in value and schema)?"
         return self.value == other.value and self.schema == other.schema
@@ -149,3 +161,22 @@ def as_etree(self) -> etree._Element:
             identifiers_root.append(identifier.as_xml_tree)
 
         return identifiers_root
+
+    def by_score(self, type: Optional[type[Identifier]] = None) -> list[Identifier]:
+        """
+        :param type: Optionally, an identifier type to constrain this list to.
+
+        :return: Return a list of identifiers, sorted by their score in descending order.
+        """
+        identifiers = self.of_type(type) if type else list(self.values())
+        return sorted(identifiers, key=lambda v: v.score, reverse=True)
+
+    def preferred(self, type: Optional[type[Identifier]] = None) -> Optional[Identifier]:
+        """
+        :param type: Optionally, an identifier type to constrain the results to.
+
+        :return: Return the highest scoring identifier of the given type (or of any type, if none is specified). Returns `None` if no identifier is available.
+        """
+        if len(self) == 0:
+            return None
+        return self.by_score(type)[0]
diff --git a/src/caselawclient/models/identifiers/fclid.py b/src/caselawclient/models/identifiers/fclid.py
@@ -27,6 +27,8 @@ class FindCaseLawIdentifierSchema(IdentifierSchema):
 
     name = "Find Case Law Identifier"
     namespace = "fclid"
+    human_readable = False
+    base_score_multiplier = 0.8
 
     @classmethod
     def validate_identifier(cls, value: str) -> bool:

diff --git a/src/caselawclient/models/identifiers/neutral_citation.py b/src/caselawclient/models/identifiers/neutral_citation.py
@@ -30,6 +30,8 @@ class NeutralCitationNumberSchema(IdentifierSchema):
 
     name = "Neutral Citation Number"
     namespace = "ukncn"
+    human_readable = True
+    base_score_multiplier = 1.5
 
     @classmethod
     def validate_identifier(cls, value: str) -> bool:

diff --git a/src/caselawclient/models/judgments.py b/src/caselawclient/models/judgments.py
@@ -25,20 +25,17 @@ def __init__(self, uri: DocumentURIString, *args: Any, **kwargs: Any) -> None:
         super().__init__(self.document_noun, uri, *args, **kwargs)
 
     @cached_property
-    def neutral_citation(self) -> NeutralCitationString:
-        return NeutralCitationString(
-            self.body.get_xpath_match_string(
-                "/akn:akomaNtoso/akn:*/akn:meta/akn:proprietary/uk:cite/text()",
-                {
-                    "uk": "https://caselaw.nationalarchives.gov.uk/akn",
-                    "akn": "http://docs.oasis-open.org/legaldocml/ns/akn/3.0",
-                },
-            )
+    def neutral_citation(self) -> Optional[NeutralCitationString]:
+        value_in_xml = self.body.get_xpath_match_string(
+            "/akn:akomaNtoso/akn:*/akn:meta/akn:proprietary/uk:cite/text()",
+            {
+                "uk": "https://caselaw.nationalarchives.gov.uk/akn",
+                "akn": "http://docs.oasis-open.org/legaldocml/ns/akn/3.0",
+            },
         )
-
-    @property
-    def best_human_identifier(self) -> str:
-        return self.neutral_citation
+        if value_in_xml:
+            return NeutralCitationString(value_in_xml)
+        return None
 
     @cached_property
     def linked_document(self) -> Optional["PressSummary"]:

diff --git a/src/caselawclient/models/neutral_citation_mixin.py b/src/caselawclient/models/neutral_citation_mixin.py
@@ -1,9 +1,10 @@
 from abc import ABC, abstractmethod
 from functools import cached_property
-from typing import Any
+from typing import Any, Optional
 
 from ds_caselaw_utils import neutral_url
 from ds_caselaw_utils.types import NeutralCitationString
+from typing_extensions import deprecated
 
 
 class NeutralCitationMixin(ABC):
@@ -38,12 +39,15 @@ def __init__(self, document_noun: str, *args: Any, **kwargs: Any) -> None:
 
     @cached_property
     @abstractmethod
-    def neutral_citation(self) -> NeutralCitationString: ...
+    @deprecated("Legacy usage of NCNs is deprecated; you should be moving to the Identifiers framework")
+    def neutral_citation(self) -> Optional[NeutralCitationString]: ...
 
     @cached_property
+    @deprecated("Legacy usage of NCNs is deprecated; you should be moving to the Identifiers framework")
     def has_ncn(self) -> bool:
-        return bool(self.neutral_citation)
+        return self.neutral_citation is not None and self.neutral_citation != ""
 
     @cached_property
+    @deprecated("Legacy usage of NCNs is deprecated; you should be moving to the Identifiers framework")
     def has_valid_ncn(self) -> bool:
-        return self.has_ncn and neutral_url(self.neutral_citation) is not None
+        return self.neutral_citation is not None and neutral_url(self.neutral_citation) is not None
diff --git a/src/caselawclient/models/press_summaries.py b/src/caselawclient/models/press_summaries.py
@@ -27,19 +27,16 @@ def __init__(self, uri: DocumentURIString, *args: Any, **kwargs: Any) -> None:
         super().__init__(self.document_noun, uri, *args, **kwargs)
 
     @cached_property
-    def neutral_citation(self) -> NeutralCitationString:
-        return NeutralCitationString(
-            self.body.get_xpath_match_string(
-                "/akn:akomaNtoso/akn:doc/akn:preface/akn:p/akn:neutralCitation/text()",
-                {
-                    "akn": "http://docs.oasis-open.org/legaldocml/ns/akn/3.0",
-                },
-            )
+    def neutral_citation(self) -> Optional[NeutralCitationString]:
+        value_in_xml = self.body.get_xpath_match_string(
+            "/akn:akomaNtoso/akn:doc/akn:preface/akn:p/akn:neutralCitation/text()",
+            {
+                "akn": "http://docs.oasis-open.org/legaldocml/ns/akn/3.0",
+            },
         )
-
-    @property
-    def best_human_identifier(self) -> str:
-        return self.neutral_citation
+        if value_in_xml:
+            return NeutralCitationString(value_in_xml)
+        return None
 
     @cached_property
     def linked_document(self) -> Optional[Judgment]:

diff --git a/tests/models/identifiers/test_identifiers.py b/tests/models/identifiers/test_identifiers.py
@@ -5,20 +5,44 @@
 from caselawclient.models.identifiers.neutral_citation import NeutralCitationNumber
 
 
+class TestIdentifierSchema(IdentifierSchema):
+    __test__ = False
+
+    name = "Test Schema"
+    namespace = "test"
+    human_readable = True
+    base_score_multiplier = 2.5
+
+    @classmethod
+    def compile_identifier_url_slug(cls, value: str) -> str:
+        return value.lower()
+
+
+class TestIdentifier(Identifier):
+    __test__ = False
+
+    schema = TestIdentifierSchema
+
+
 @pytest.fixture
 def identifiers():
     return Identifiers(
         {"id-1": TestIdentifier(uuid="id-1", value="TEST-111"), "id-2": TestIdentifier(uuid="id-2", value="TEST-222")}
     )
 
 
+TEST_NCN_1701 = NeutralCitationNumber(value="[1701] UKSC 999")
+TEST_NCN_1234 = NeutralCitationNumber(value="[1234] UKSC 999")
+TEST_IDENTIFIER_999 = TestIdentifier("TEST-999")
+
+
 @pytest.fixture
 def mixed_identifiers():
     return Identifiers(
         {
-            "id-A": NeutralCitationNumber(value="[1701] UKSC 999"),
-            "id-B": TestIdentifier("TEST-999"),
-            "id-C": NeutralCitationNumber(value="[1234] UKSC 999"),
+            "id-A": TEST_NCN_1701,
+            "id-B": TEST_IDENTIFIER_999,
+            "id-C": TEST_NCN_1234,
         }
     )
 
@@ -28,23 +52,6 @@ def id3():
     return TestIdentifier(uuid="id-3", value="TEST-333")
 
 
-class TestIdentifierSchema(IdentifierSchema):
-    __test__ = False
-
-    name = "Test Schema"
-    namespace = "test"
-
-    @classmethod
-    def compile_identifier_url_slug(cls, value: str) -> str:
-        return value.lower()
-
-
-class TestIdentifier(Identifier):
-    __test__ = False
-
-    schema = TestIdentifierSchema
-
-
 class TestIdentifierBase:
     def test_uuid_setting(self):
         """Ensure that if a UUID is provided when initialising an Identifier that it is properly stored."""
@@ -116,3 +123,21 @@ def test_delete_type(self, mixed_identifiers):
         assert "TEST-999" in str(mixed_identifiers)
         assert "[1701] UKSC 999" not in str(mixed_identifiers)
         assert "[1234] UKSC 999" not in str(mixed_identifiers)
+
+
+class TestIdentifierScoring:
+    def test_base_scoring(self):
+        identifier = TestIdentifier(value="TEST-123")
+        assert identifier.score == 2.5
+
+    def test_sorting(self, mixed_identifiers):
+        assert mixed_identifiers.by_score() == [TEST_IDENTIFIER_999, TEST_NCN_1701, TEST_NCN_1234]
+
+    def test_preferred_identifier(self, mixed_identifiers):
+        assert mixed_identifiers.preferred() == TEST_IDENTIFIER_999
+
+    def test_sorting_with_type(self, mixed_identifiers):
+        assert mixed_identifiers.by_score(type=NeutralCitationNumber) == [TEST_NCN_1701, TEST_NCN_1234]
+
+    def test_preferred_identifier_with_type(self, mixed_identifiers):
+        assert mixed_identifiers.preferred(type=NeutralCitationNumber) == TEST_NCN_1701
diff --git a/tests/models/test_judgments.py b/tests/models/test_judgments.py
@@ -5,15 +5,17 @@
 from caselawclient.errors import DocumentNotFoundError
 from caselawclient.factories import PressSummaryFactory
 from caselawclient.models.documents import DocumentURIString
+from caselawclient.models.identifiers.neutral_citation import NeutralCitationNumber
 from caselawclient.models.judgments import Judgment
 from caselawclient.models.neutral_citation_mixin import NeutralCitationString
 
 
 class TestJudgment:
     def test_best_identifier(self, mock_api_client):
         judgment = Judgment(DocumentURIString("test/1234"), mock_api_client)
-        judgment.neutral_citation = NeutralCitationString("[2023] TEST 1234")
-        assert judgment.best_human_identifier == judgment.neutral_citation
+        document_ncn = NeutralCitationNumber(value="[2023] TEST 1234")
+        judgment.identifiers.add(document_ncn)
+        assert judgment.best_human_identifier == document_ncn
 
 
 class TestJudgmentValidation:

diff --git a/tests/models/test_press_summaries.py b/tests/models/test_press_summaries.py
@@ -5,15 +5,17 @@
 from caselawclient.errors import DocumentNotFoundError
 from caselawclient.factories import JudgmentFactory
 from caselawclient.models.documents import DocumentURIString
+from caselawclient.models.identifiers.neutral_citation import NeutralCitationNumber
 from caselawclient.models.neutral_citation_mixin import NeutralCitationString
 from caselawclient.models.press_summaries import PressSummary
 
 
 class TestPressSummary:
     def test_best_identifier(self, mock_api_client):
         summary = PressSummary(DocumentURIString("test/1234"), mock_api_client)
-        summary.neutral_citation = NeutralCitationString("[2023] TEST 1234")
-        assert summary.best_human_identifier == summary.neutral_citation
+        document_ncn = NeutralCitationNumber(value="[2023] TEST 1234")
+        summary.identifiers.add(document_ncn)
+        assert summary.best_human_identifier == document_ncn
 
 
 class TestPressSummaryValidation: