diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8c64f69..caafe8c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -24,9 +24,11 @@ jobs: SKIP: no-commit-to-branch test: env: - MARKLOGIC_HOST: "" - MARKLOGIC_USER: "" - MARKLOGIC_PASSWORD: "" + MARKLOGIC_HOST: ml-host + MARKLOGIC_USER: ml-user + MARKLOGIC_PASSWORD: ml-password + MARKLOGIC_USE_HTTPS: 0 + AWS_BUCKET_NAME: judgments-original-versions name: Run unit tests runs-on: ubuntu-24.04 steps: diff --git a/ds-caselaw-ingester/lambda_function.py b/ds-caselaw-ingester/lambda_function.py index 7c488e0..8ff5fce 100644 --- a/ds-caselaw-ingester/lambda_function.py +++ b/ds-caselaw-ingester/lambda_function.py @@ -22,6 +22,7 @@ from notifications_python_client.notifications import NotificationsAPIClient import logging from caselawclient.models.documents import Document +from uuid import uuid4 logger = logging.getLogger("ingester") logger.setLevel(logging.DEBUG) @@ -30,7 +31,7 @@ rollbar.init(os.getenv("ROLLBAR_TOKEN"), environment=os.getenv("ROLLBAR_ENV")) -MARKLOGIC_HOST: str = os.environ["AWS_BUCKET_NAME"] +MARKLOGIC_HOST: str = os.environ["MARKLOGIC_HOST"] MARKLOGIC_USER: str = os.environ["MARKLOGIC_USER"] MARKLOGIC_PASSWORD: str = os.environ["MARKLOGIC_PASSWORD"] MARKLOGIC_USE_HTTPS: bool = bool(os.environ["MARKLOGIC_USE_HTTPS"]) @@ -238,18 +239,6 @@ def extract_metadata(tar: tarfile.TarFile, consignment_reference: str): return decoder.decode(te_metadata_file.read().decode("utf-8")) -def extract_uri(metadata: dict, consignment_reference: str) -> str: - uri = metadata["parameters"]["PARSER"].get("uri", "") - - if uri: - uri = uri.replace("https://caselaw.nationalarchives.gov.uk/id/", "") - - if not uri: - uri = f"failures/{consignment_reference}" - - return uri - - # called by tests def get_consignment_reference(message): return Message.from_message(message).get_consignment_reference() @@ -409,7 +398,7 @@ def __init__(self, message: Message): self.message.update_consignment_reference(self.metadata["parameters"]["TRE"]["reference"]) self.consignment_reference = self.message.get_consignment_reference() self.xml_file_name = self.metadata["parameters"]["TRE"]["payload"]["xml"] - self.uri = DocumentURIString(extract_uri(self.metadata, self.consignment_reference)) + self.uri = DocumentURIString("d-" + str(uuid4())) print(f"Ingesting document {self.uri}") self.xml = get_best_xml(self.uri, self.tar, self.xml_file_name, self.consignment_reference) diff --git a/ds-caselaw-ingester/tests.py b/ds-caselaw-ingester/tests.py index 93f6abe..c1d8939 100644 --- a/ds-caselaw-ingester/tests.py +++ b/ds-caselaw-ingester/tests.py @@ -280,22 +280,6 @@ def test_extract_metadata_not_found_tdr(self): with pytest.raises(lambda_function.FileNotFoundException, match="Consignment Ref:"): lambda_function.extract_metadata(tar, consignment_reference) - def test_extract_uri_success(self): - metadata = {"parameters": {"PARSER": {"uri": "https://caselaw.nationalarchives.gov.uk/id/ewca/civ/2022/111"}}} - assert lambda_function.extract_uri(metadata, "anything") == "ewca/civ/2022/111" - - def test_extract_uri_incompete(self): - metadata = {"parameters": {"PARSER": {"uri": "https://caselaw.nationalarchives.gov.uk/id/"}}} - assert lambda_function.extract_uri(metadata, "anything") == "failures/anything" - - def test_extract_uri_missing_key(self): - metadata = {"parameters": {"PARSER": {}}} - assert lambda_function.extract_uri(metadata, "anything") == "failures/anything" - - def test_extract_uri_none(self): - metadata = {"parameters": {"PARSER": {"uri": None}}} - assert lambda_function.extract_uri(metadata, "anything") == "failures/anything" - def test_extract_docx_filename_success(self): metadata = {"parameters": {"TRE": {"payload": {"filename": "judgment.docx"}}}} assert lambda_function.extract_docx_filename(metadata, "anything") == "judgment.docx"