Skip to content

Commit

Permalink
Merge pull request #230 from nationalarchives/FCL-496-add-ncn-to-iden…
Browse files Browse the repository at this point in the history
…tifiers-on-ingest

[FCL-485] Add NCN to identifiers on ingest
  • Loading branch information
dragon-dxw authored Dec 12, 2024
2 parents 0564cbe + d094d48 commit 6635cdb
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 15 deletions.
4 changes: 2 additions & 2 deletions .env.example
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
MARKLOGIC_HOST=host.docker.internal
MARKLOGIC_USER=
MARKLOGIC_PASSWORD=
MARKLOGIC_USER=admin
MARKLOGIC_PASSWORD=admin
XSLT_IMAGE_LOCATION=
AWS_ACCESS_KEY_ID=123
AWS_SECRET_KEY=xyz
Expand Down
49 changes: 37 additions & 12 deletions ds-caselaw-ingester/lambda_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@
import os
import tarfile
import xml.etree.ElementTree as ET
from typing import Dict, List, Tuple
from urllib.parse import unquote_plus
from xml.sax.saxutils import escape
from caselawclient.models.identifiers.neutral_citation import NeutralCitationNumber
from caselawclient.models.documents import DocumentURIString

import boto3
import rollbar
Expand All @@ -18,6 +19,11 @@
from caselawclient.client_helpers import VersionAnnotation, VersionType
from dotenv import load_dotenv
from notifications_python_client.notifications import NotificationsAPIClient
import logging
from caselawclient.models.documents import Document

logger = logging.getLogger("ingester")
logger.setLevel(logging.DEBUG)

load_dotenv()
rollbar.init(os.getenv("ROLLBAR_TOKEN"), environment=os.getenv("ROLLBAR_ENV"))
Expand All @@ -37,7 +43,7 @@ def __init__(self, metadata):
self.parameters = metadata.get("parameters", {})

@property
def is_tdr(self):
def is_tdr(self) -> bool:
return "TDR" in self.parameters.keys()

@property
Expand Down Expand Up @@ -86,7 +92,7 @@ def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

@property
def originator(self):
def originator(self) -> str:
return self.message.get("parameters", {}).get("originator")

def get_consignment_reference(self):
Expand All @@ -100,7 +106,7 @@ def get_consignment_reference(self):

raise InvalidMessageException("Malformed v2 message, please supply a reference")

def save_s3_response(self, sqs_client, s3_client):
def save_s3_response(self, sqs_client, s3_client) -> str:
s3_bucket = self.message.get("parameters", {}).get("s3Bucket")
s3_key = self.message.get("parameters", {}).get("s3Key")
reference = self.get_consignment_reference()
Expand Down Expand Up @@ -195,7 +201,7 @@ def modify_filename(original: str, addition: str) -> str:
return os.path.join(path, new_basename)


def all_messages(event) -> List[Message]:
def all_messages(event) -> list[Message]:
"""All the messages in the SNS event, as Message subclasses"""
decoder = json.decoder.JSONDecoder()
messages_as_decoded_json = [decoder.decode(record["Sns"]["Message"]) for record in event["Records"]]
Expand Down Expand Up @@ -249,7 +255,7 @@ def extract_docx_filename(metadata: dict, consignment_reference: str) -> str:
)


def extract_lambda_versions(versions: List[Dict[str, str]]) -> List[Tuple[str, str]]:
def extract_lambda_versions(versions: list[dict[str, str]]) -> list[tuple[str, str]]:
version_tuples = []
for d in versions:
version_tuples += list(d.items())
Expand Down Expand Up @@ -437,6 +443,24 @@ def insert_document_xml(self) -> bool:
api_client.insert_document_xml(self.uri, self.xml, annotation)
return True

def set_document_identifiers(self) -> None:
    """Store the ingested document's neutral citation number as an identifier.

    Fetches the document at ``self.uri`` through ``api_client``, reads its
    ``neutral_citation`` and, when one is present, adds a
    ``NeutralCitationNumber`` to the document's identifiers and saves them.
    When no neutral citation is available nothing is written; the outcome is
    logged either way.
    """
    doc = api_client.get_document_by_uri(DocumentURIString(self.uri))
    if doc.identifiers:
        # Only a warning: ingestion deliberately carries on and the NCN is
        # still added below.
        # NOTE(review): on a re-ingest this may add the same NCN a second
        # time -- confirm how the identifiers collection handles duplicates.
        msg = f"Ingesting, but identifiers already present for {self.uri}!"
        logger.warning(msg)

    # Some document types may not expose a neutral citation at all; treat a
    # missing attribute the same as an empty value.
    try:
        ncn = doc.neutral_citation
    except AttributeError:
        ncn = None

    if ncn:
        doc.identifiers.add(NeutralCitationNumber(ncn))
        doc.identifiers.save(doc)
        logger.info(f"Ingested document had NCN {ncn}")
    else:
        # Plain string: the original used an f-string with no placeholders.
        logger.info("Ingested document had NCN (NOT FOUND)")

def send_updated_judgment_notification(self) -> None:
personalisation = personalise_email(self.uri, self.metadata)
if os.getenv("ROLLBAR_ENV") != "prod":
Expand Down Expand Up @@ -501,7 +525,7 @@ def store_metadata(self) -> None:
value=tdr_metadata["Consignment-Completed-Datetime"],
)

def save_files_to_s3(self):
def save_files_to_s3(self) -> None:
sqs_client, s3_client = aws_clients()
# Determine if there's a word document -- we need to know before we save the tar.gz file
docx_filename = extract_docx_filename(self.metadata, self.consignment_reference)
Expand Down Expand Up @@ -555,7 +579,7 @@ def save_files_to_s3(self):
)

@property
def metadata_object(self):
def metadata_object(self) -> Metadata:
return Metadata(self.metadata)

def will_publish(self) -> bool:
Expand All @@ -574,7 +598,7 @@ def will_publish(self) -> bool:

raise RuntimeError(f"Didn't recognise originator {originator!r}")

def send_email(self):
def send_email(self) -> None:
originator = self.message.originator
if originator == "FCL":
return None
Expand All @@ -587,19 +611,20 @@ def send_email(self):

raise RuntimeError(f"Didn't recognise originator {originator!r}")

def close_tar(self):
def close_tar(self) -> None:
self.tar.close()

def upload_xml(self):
def upload_xml(self) -> None:
self.updated = self.update_document_xml()
self.inserted = False if self.updated else self.insert_document_xml()
if not self.updated and not self.inserted:
raise DocumentInsertionError(
f"Judgment {self.uri} failed to insert into Marklogic. Consignment Ref: {self.consignment_reference}"
)
self.set_document_identifiers()

@property
def upload_state(self):
def upload_state(self) -> str:
return "updated" if self.updated else "inserted"


Expand Down
9 changes: 9 additions & 0 deletions ds-caselaw-ingester/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,8 @@ def test_handler_messages_v2(
capsys,
):
boto_session.return_value.client.return_value.download_file = create_fake_tdr_file
doc = apiclient.get_document_by_uri.return_value
doc.neutral_citation = None

message = v2_message_raw
event = {"Records": [{"Sns": {"Message": message}}, {"Sns": {"Message": message}}]}
Expand All @@ -161,6 +163,8 @@ def test_handler_messages_v2(
payload=ANY,
)
assert annotation.call_count == 2
doc.identifiers.add.assert_not_called()
doc.identifiers.save.assert_not_called()

@patch("lambda_function.api_client", autospec=True)
@patch("lambda_function.boto3.session.Session")
Expand All @@ -180,6 +184,8 @@ def test_handler_messages_s3(
):
"""Test that, with appropriate stubs, an S3 message passes through the parsing process"""
boto_session.return_value.client.return_value.download_file = create_fake_bulk_file
doc = apiclient.get_document_by_uri.return_value
doc.neutral_citation = "[2012] UKUT 82 (IAC)"

message = s3_message_raw
event = {"Records": [{"Sns": {"Message": message}}, {"Sns": {"Message": message}}]}
Expand All @@ -200,13 +206,16 @@ def test_handler_messages_s3(
notify_new.assert_not_called()
notify_updated.assert_not_called()
modify_filename.assert_not_called()

annotation.assert_called_with(
ANY,
automated=True,
message="Updated document uploaded by Find Case Law",
payload=ANY,
)
assert annotation.call_count == 2
assert doc.identifiers.add.call_args_list[0].args[0].value == "[2012] UKUT 82 (IAC)"
doc.identifiers.save.assert_called()


class TestLambda:
Expand Down
2 changes: 1 addition & 1 deletion requirements/base.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
django-environ~=0.10
ds-caselaw-marklogic-api-client==27.4.0
ds-caselaw-marklogic-api-client==28.1.0
requests-toolbelt~=1.0
urllib3~=2.2
boto3
Expand Down

0 comments on commit 6635cdb

Please sign in to comment.