Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FCL-485] Add NCN to identifiers on ingest #230

Merged
merged 3 commits into from
Dec 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .env.example
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
MARKLOGIC_HOST=host.docker.internal
MARKLOGIC_USER=
MARKLOGIC_PASSWORD=
MARKLOGIC_USER=admin
MARKLOGIC_PASSWORD=admin
XSLT_IMAGE_LOCATION=
AWS_ACCESS_KEY_ID=123
AWS_SECRET_KEY=xyz
Expand Down
49 changes: 37 additions & 12 deletions ds-caselaw-ingester/lambda_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@
import os
import tarfile
import xml.etree.ElementTree as ET
from typing import Dict, List, Tuple
from urllib.parse import unquote_plus
from xml.sax.saxutils import escape
from caselawclient.models.identifiers.neutral_citation import NeutralCitationNumber
from caselawclient.models.documents import DocumentURIString

import boto3
import rollbar
Expand All @@ -18,6 +19,11 @@
from caselawclient.client_helpers import VersionAnnotation, VersionType
from dotenv import load_dotenv
from notifications_python_client.notifications import NotificationsAPIClient
import logging
from caselawclient.models.documents import Document

logger = logging.getLogger("ingester")
logger.setLevel(logging.DEBUG)

load_dotenv()
rollbar.init(os.getenv("ROLLBAR_TOKEN"), environment=os.getenv("ROLLBAR_ENV"))
Expand All @@ -37,7 +43,7 @@ def __init__(self, metadata):
self.parameters = metadata.get("parameters", {})

@property
def is_tdr(self):
def is_tdr(self) -> bool:
    """Return True when the metadata parameters contain a "TDR" entry."""
    # Membership on the mapping itself already tests keys; .keys() is redundant.
    return "TDR" in self.parameters

@property
Expand Down Expand Up @@ -86,7 +92,7 @@ def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

@property
def originator(self):
def originator(self) -> "str | None":
    """Return the "originator" value from the message's "parameters" mapping.

    Both lookups use ``.get``, so a message with no "parameters" entry, or
    with parameters lacking "originator", yields ``None`` instead of raising.
    """
    parameters = self.message.get("parameters", {})
    return parameters.get("originator")

def get_consignment_reference(self):
Expand All @@ -100,7 +106,7 @@ def get_consignment_reference(self):

raise InvalidMessageException("Malformed v2 message, please supply a reference")

def save_s3_response(self, sqs_client, s3_client):
def save_s3_response(self, sqs_client, s3_client) -> str:
s3_bucket = self.message.get("parameters", {}).get("s3Bucket")
s3_key = self.message.get("parameters", {}).get("s3Key")
reference = self.get_consignment_reference()
Expand Down Expand Up @@ -195,7 +201,7 @@ def modify_filename(original: str, addition: str) -> str:
return os.path.join(path, new_basename)


def all_messages(event) -> List[Message]:
def all_messages(event) -> list[Message]:
"""All the messages in the SNS event, as Message subclasses"""
decoder = json.decoder.JSONDecoder()
messages_as_decoded_json = [decoder.decode(record["Sns"]["Message"]) for record in event["Records"]]
Expand Down Expand Up @@ -249,7 +255,7 @@ def extract_docx_filename(metadata: dict, consignment_reference: str) -> str:
)


def extract_lambda_versions(versions: List[Dict[str, str]]) -> List[Tuple[str, str]]:
def extract_lambda_versions(versions: list[dict[str, str]]) -> list[tuple[str, str]]:
version_tuples = []
for d in versions:
version_tuples += list(d.items())
Expand Down Expand Up @@ -437,6 +443,24 @@ def insert_document_xml(self) -> bool:
api_client.insert_document_xml(self.uri, self.xml, annotation)
return True

def set_document_identifiers(self) -> None:
    """Record the ingested document's neutral citation as an identifier.

    Fetches the document at ``self.uri``. If it exposes a truthy
    ``neutral_citation``, that value is wrapped in ``NeutralCitationNumber``,
    added to ``doc.identifiers`` and saved; otherwise only a log line is
    emitted. Identifiers already present on the document trigger a warning
    but do not stop the update.
    """
    doc = api_client.get_document_by_uri(DocumentURIString(self.uri))
    if doc.identifiers:
        logger.warning("Ingesting, but identifiers already present for %s!", self.uri)

    # A document object lacking the attribute is treated like one with no NCN.
    ncn = getattr(doc, "neutral_citation", None)
    if not ncn:
        logger.info("Ingested document had NCN (NOT FOUND)")
        return

    doc.identifiers.add(NeutralCitationNumber(ncn))
    doc.identifiers.save(doc)
    logger.info("Ingested document had NCN %s", ncn)

def send_updated_judgment_notification(self) -> None:
personalisation = personalise_email(self.uri, self.metadata)
if os.getenv("ROLLBAR_ENV") != "prod":
Expand Down Expand Up @@ -501,7 +525,7 @@ def store_metadata(self) -> None:
value=tdr_metadata["Consignment-Completed-Datetime"],
)

def save_files_to_s3(self):
def save_files_to_s3(self) -> None:
sqs_client, s3_client = aws_clients()
# Determine if there's a word document -- we need to know before we save the tar.gz file
docx_filename = extract_docx_filename(self.metadata, self.consignment_reference)
Expand Down Expand Up @@ -555,7 +579,7 @@ def save_files_to_s3(self):
)

@property
def metadata_object(self):
def metadata_object(self) -> Metadata:
    """Wrap the raw ``self.metadata`` in a new ``Metadata`` instance."""
    return Metadata(self.metadata)

def will_publish(self) -> bool:
Expand All @@ -574,7 +598,7 @@ def will_publish(self) -> bool:

raise RuntimeError(f"Didn't recognise originator {originator!r}")

def send_email(self):
def send_email(self) -> None:
originator = self.message.originator
if originator == "FCL":
return None
Expand All @@ -587,19 +611,20 @@ def send_email(self):

raise RuntimeError(f"Didn't recognise originator {originator!r}")

def close_tar(self):
def close_tar(self) -> None:
    """Close the tar archive held in ``self.tar``."""
    self.tar.close()

def upload_xml(self):
def upload_xml(self) -> None:
    """Update or insert the document XML, then set its identifiers.

    Sets ``self.updated`` from ``update_document_xml()``; an insert is only
    attempted when the update result is falsy, and its result is stored in
    ``self.inserted``.

    Raises:
        DocumentInsertionError: if neither the update nor the insert
            reported success.
    """
    self.updated = self.update_document_xml()
    # Skip the insert entirely once the update has succeeded.
    self.inserted = False if self.updated else self.insert_document_xml()
    if not (self.updated or self.inserted):
        raise DocumentInsertionError(
            f"Judgment {self.uri} failed to insert into Marklogic. Consignment Ref: {self.consignment_reference}"
        )
    # Identifiers are set only after the XML is known to be stored.
    self.set_document_identifiers()

@property
def upload_state(self):
def upload_state(self) -> str:
    """Return "updated" when ``self.updated`` is truthy, otherwise "inserted"."""
    return "updated" if self.updated else "inserted"


Expand Down
9 changes: 9 additions & 0 deletions ds-caselaw-ingester/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,8 @@ def test_handler_messages_v2(
capsys,
):
boto_session.return_value.client.return_value.download_file = create_fake_tdr_file
doc = apiclient.get_document_by_uri.return_value
doc.neutral_citation = None

message = v2_message_raw
event = {"Records": [{"Sns": {"Message": message}}, {"Sns": {"Message": message}}]}
Expand All @@ -161,6 +163,8 @@ def test_handler_messages_v2(
payload=ANY,
)
assert annotation.call_count == 2
doc.identifiers.add.assert_not_called()
doc.identifiers.save.assert_not_called()

@patch("lambda_function.api_client", autospec=True)
@patch("lambda_function.boto3.session.Session")
Expand All @@ -180,6 +184,8 @@ def test_handler_messages_s3(
):
"""Test that, with appropriate stubs, an S3 message passes through the parsing process"""
boto_session.return_value.client.return_value.download_file = create_fake_bulk_file
doc = apiclient.get_document_by_uri.return_value
doc.neutral_citation = "[2012] UKUT 82 (IAC)"

message = s3_message_raw
event = {"Records": [{"Sns": {"Message": message}}, {"Sns": {"Message": message}}]}
Expand All @@ -200,13 +206,16 @@ def test_handler_messages_s3(
notify_new.assert_not_called()
notify_updated.assert_not_called()
modify_filename.assert_not_called()

annotation.assert_called_with(
ANY,
automated=True,
message="Updated document uploaded by Find Case Law",
payload=ANY,
)
assert annotation.call_count == 2
assert doc.identifiers.add.call_args_list[0].args[0].value == "[2012] UKUT 82 (IAC)"
doc.identifiers.save.assert_called()


class TestLambda:
Expand Down
2 changes: 1 addition & 1 deletion requirements/base.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
django-environ~=0.10
ds-caselaw-marklogic-api-client==27.4.0
ds-caselaw-marklogic-api-client==28.1.0
requests-toolbelt~=1.0
urllib3~=2.2
boto3
Expand Down
Loading