From f647e4d07bf19a10a2b0dff9e951b6ec5d73eeaa Mon Sep 17 00:00:00 2001 From: David McKee Date: Thu, 12 Dec 2024 11:21:56 +0000 Subject: [PATCH 1/3] Add default local marklogic username/passwords to .env.example --- .env.example | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.env.example b/.env.example index 2dfad34..522a78b 100644 --- a/.env.example +++ b/.env.example @@ -1,6 +1,6 @@ MARKLOGIC_HOST=host.docker.internal -MARKLOGIC_USER= -MARKLOGIC_PASSWORD= +MARKLOGIC_USER=admin +MARKLOGIC_PASSWORD=admin XSLT_IMAGE_LOCATION= AWS_ACCESS_KEY_ID=123 AWS_SECRET_KEY=xyz From 8b982c12f00f3e919e268f401ce4d8df335ccc65 Mon Sep 17 00:00:00 2001 From: David McKee Date: Thu, 12 Dec 2024 11:23:07 +0000 Subject: [PATCH 2/3] Add types to a lot of functions --- ds-caselaw-ingester/lambda_function.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/ds-caselaw-ingester/lambda_function.py b/ds-caselaw-ingester/lambda_function.py index f6b1e17..a3b701e 100644 --- a/ds-caselaw-ingester/lambda_function.py +++ b/ds-caselaw-ingester/lambda_function.py @@ -37,7 +37,7 @@ def __init__(self, metadata): self.parameters = metadata.get("parameters", {}) @property - def is_tdr(self): + def is_tdr(self) -> bool: return "TDR" in self.parameters.keys() @property @@ -86,7 +86,7 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @property - def originator(self): + def originator(self) -> str: return self.message.get("parameters", {}).get("originator") def get_consignment_reference(self): @@ -100,7 +100,7 @@ def get_consignment_reference(self): raise InvalidMessageException("Malformed v2 message, please supply a reference") - def save_s3_response(self, sqs_client, s3_client): + def save_s3_response(self, sqs_client, s3_client) -> str: s3_bucket = self.message.get("parameters", {}).get("s3Bucket") s3_key = self.message.get("parameters", {}).get("s3Key") reference = self.get_consignment_reference() @@ -195,7 +195,7 @@ def modify_filename(original: str, addition: str) -> str: return os.path.join(path, new_basename) -def all_messages(event) -> List[Message]: +def all_messages(event) -> list[Message]: """All the messages in the SNS event, as Message subclasses""" decoder = json.decoder.JSONDecoder() messages_as_decoded_json = [decoder.decode(record["Sns"]["Message"]) for record in event["Records"]] @@ -249,7 +249,7 @@ def extract_docx_filename(metadata: dict, consignment_reference: str) -> str: ) -def extract_lambda_versions(versions: List[Dict[str, str]]) -> List[Tuple[str, str]]: +def extract_lambda_versions(versions: list[dict[str, str]]) -> list[tuple[str, str]]: version_tuples = [] for d in versions: version_tuples += list(d.items()) @@ -501,7 +501,7 @@ def store_metadata(self) -> None: value=tdr_metadata["Consignment-Completed-Datetime"], ) - def save_files_to_s3(self): + def save_files_to_s3(self) -> None: sqs_client, s3_client = aws_clients() # Determine if there's a word document -- we need to know before we save the tar.gz file docx_filename = extract_docx_filename(self.metadata, self.consignment_reference) @@ -555,7 +555,7 @@ def save_files_to_s3(self): ) @property - def metadata_object(self): + def metadata_object(self) -> Metadata: return Metadata(self.metadata) def will_publish(self) -> bool: @@ -574,7 +574,7 @@ def will_publish(self) -> bool: raise RuntimeError(f"Didn't recognise originator {originator!r}") - def send_email(self): + def send_email(self) -> None: originator = self.message.originator if originator == "FCL": return None @@ -587,10 +587,10 @@ def send_email(self): raise RuntimeError(f"Didn't recognise originator {originator!r}") - def close_tar(self): + def close_tar(self) -> None: self.tar.close() - def upload_xml(self): + def upload_xml(self) -> None: self.updated = self.update_document_xml() self.inserted = False if self.updated else self.insert_document_xml() if not self.updated and not self.inserted: @@ -599,7 +599,7 @@ def upload_xml(self): ) @property - def upload_state(self): + def upload_state(self) -> str: return "updated" if self.updated else "inserted" From d094d48a58ca79b3a7dc23f19af7e759d763dd21 Mon Sep 17 00:00:00 2001 From: David McKee Date: Thu, 12 Dec 2024 11:22:54 +0000 Subject: [PATCH 3/3] Add NCN to identifiers on ingest --- ds-caselaw-ingester/lambda_function.py | 27 +++++++++++++++++++++++++- ds-caselaw-ingester/tests.py | 9 +++++++++ requirements/base.txt | 2 +- 3 files changed, 36 insertions(+), 2 deletions(-) diff --git a/ds-caselaw-ingester/lambda_function.py b/ds-caselaw-ingester/lambda_function.py index a3b701e..950b44c 100644 --- a/ds-caselaw-ingester/lambda_function.py +++ b/ds-caselaw-ingester/lambda_function.py @@ -2,9 +2,10 @@ import os import tarfile import xml.etree.ElementTree as ET -from typing import Dict, List, Tuple from urllib.parse import unquote_plus from xml.sax.saxutils import escape +from caselawclient.models.identifiers.neutral_citation import NeutralCitationNumber +from caselawclient.models.documents import DocumentURIString import boto3 import rollbar @@ -18,6 +19,11 @@ from caselawclient.client_helpers import VersionAnnotation, VersionType from dotenv import load_dotenv from notifications_python_client.notifications import NotificationsAPIClient +import logging +from caselawclient.models.documents import Document + +logger = logging.getLogger("ingester") +logger.setLevel(logging.DEBUG) load_dotenv() rollbar.init(os.getenv("ROLLBAR_TOKEN"), environment=os.getenv("ROLLBAR_ENV")) @@ -437,6 +443,24 @@ def insert_document_xml(self) -> bool: api_client.insert_document_xml(self.uri, self.xml, annotation) return True + def set_document_identifiers(self) -> None: + doc = api_client.get_document_by_uri(DocumentURIString(self.uri)) + if doc.identifiers: + msg = f"Ingesting, but identifiers already present for {self.uri}!" + logger.warning(msg) + + try: + ncn = doc.neutral_citation + except AttributeError: + ncn = None + + if ncn: + doc.identifiers.add(NeutralCitationNumber(ncn)) + doc.identifiers.save(doc) + logger.info(f"Ingested document had NCN {ncn}") + else: + logger.info(f"Ingested document had NCN (NOT FOUND)") + def send_updated_judgment_notification(self) -> None: personalisation = personalise_email(self.uri, self.metadata) if os.getenv("ROLLBAR_ENV") != "prod": @@ -597,6 +621,7 @@ def upload_xml(self) -> None: raise DocumentInsertionError( f"Judgment {self.uri} failed to insert into Marklogic. Consignment Ref: {self.consignment_reference}" ) + self.set_document_identifiers() @property def upload_state(self) -> str: diff --git a/ds-caselaw-ingester/tests.py b/ds-caselaw-ingester/tests.py index 6c74b92..6c54d9a 100644 --- a/ds-caselaw-ingester/tests.py +++ b/ds-caselaw-ingester/tests.py @@ -140,6 +140,8 @@ def test_handler_messages_v2( capsys, ): boto_session.return_value.client.return_value.download_file = create_fake_tdr_file + doc = apiclient.get_document_by_uri.return_value + doc.neutral_citation = None message = v2_message_raw event = {"Records": [{"Sns": {"Message": message}}, {"Sns": {"Message": message}}]} @@ -161,6 +163,8 @@ def test_handler_messages_v2( payload=ANY, ) assert annotation.call_count == 2 + doc.identifiers.add.assert_not_called() + doc.identifiers.save.assert_not_called() @patch("lambda_function.api_client", autospec=True) @patch("lambda_function.boto3.session.Session") @@ -180,6 +184,8 @@ def test_handler_messages_s3( ): """Test that, with appropriate stubs, an S3 message passes through the parsing process""" boto_session.return_value.client.return_value.download_file = create_fake_bulk_file + doc = apiclient.get_document_by_uri.return_value + doc.neutral_citation = "[2012] UKUT 82 (IAC)" message = s3_message_raw event = {"Records": [{"Sns": {"Message": message}}, {"Sns": {"Message": message}}]} @@ -200,6 +206,7 @@ def test_handler_messages_s3( notify_new.assert_not_called() notify_updated.assert_not_called() modify_filename.assert_not_called() + annotation.assert_called_with( ANY, automated=True, @@ -207,6 +214,8 @@ def test_handler_messages_s3( payload=ANY, ) assert annotation.call_count == 2 + assert doc.identifiers.add.call_args_list[0].args[0].value == "[2012] UKUT 82 (IAC)" + doc.identifiers.save.assert_called() class TestLambda: diff --git a/requirements/base.txt b/requirements/base.txt index 0f79558..38e9627 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -1,5 +1,5 @@ django-environ~=0.10 -ds-caselaw-marklogic-api-client==27.4.0 +ds-caselaw-marklogic-api-client==28.1.0 requests-toolbelt~=1.0 urllib3~=2.2 boto3