diff --git a/ds-caselaw-ingester/lambda_function.py b/ds-caselaw-ingester/lambda_function.py
index df3e389..3c8890f 100644
--- a/ds-caselaw-ingester/lambda_function.py
+++ b/ds-caselaw-ingester/lambda_function.py
@@ -159,6 +159,16 @@ class DocumentInsertionError(ReportableException):
     pass
 
 
+def modify_filename(original: str, addition: str) -> str:
+    """Insert addition before the first dot of the basename, so TRE-2024-A.tar.gz becomes TRE-2024-A_nodocx.tar.gz"""
+    path, basename = os.path.split(original)
+    # dot will be an empty string if there is no dot in the filename.
+    # prefix will be everything up to and not including the first dot.
+    prefix, dot, suffix = basename.partition(".")
+    new_basename = f"{prefix}{addition}{dot}{suffix}"
+    return os.path.join(path, new_basename)
+
+
 def all_messages(event) -> List[Message]:
     """All the messages in the SNS event, as Message subclasses"""
     decoder = json.decoder.JSONDecoder()
@@ -505,12 +515,24 @@ def process_message(message):
     if has_TDR_data:
         store_metadata(uri, metadata)
 
+    # Determine if there's a word document -- we need to know before we save the tar.gz file
+    docx_filename = extract_docx_filename(metadata, consignment_reference)
+    print(f"extracted docx filename is {docx_filename!r}")
+
     # Copy original tarfile
-    store_file(open(filename, mode="rb"), uri, os.path.basename(filename), s3_client)
+    modified_targz_filename = (
+        filename if docx_filename else modify_filename(filename, "_nodocx")
+    )
+    # Read the tarball from its real path; only the name it is stored under is modified.
+    store_file(
+        open(filename, mode="rb"),
+        uri,
+        os.path.basename(modified_targz_filename),
+        s3_client,
+    )
+    print(f"saved tar.gz as {modified_targz_filename!r}")
 
     # Store docx and rename
-    docx_filename = extract_docx_filename(metadata, consignment_reference)
-    print(f"extracted docx filename is {docx_filename!r}")
     # The docx_filename is None for files which have been reparsed.
     if docx_filename is not None:
         copy_file(
diff --git a/ds-caselaw-ingester/tests.py b/ds-caselaw-ingester/tests.py
index 35c47b6..6c93457 100644
--- a/ds-caselaw-ingester/tests.py
+++ b/ds-caselaw-ingester/tests.py
@@ -81,8 +81,10 @@ class TestHandler:
     @patch("lambda_function.send_updated_judgment_notification")
     @patch("lambda_function.send_new_judgment_notification")
     @patch("lambda_function.VersionAnnotation")
+    @patch("lambda_function.modify_filename")
     def test_handler_messages_v2(
         self,
+        modify_filename,
         annotation,
         notify_new,
         notify_update,
@@ -114,6 +116,8 @@ def test_handler_messages_v2(
         notify_update.assert_called()
         assert notify_update.call_count == 2
         notify_new.assert_not_called()
+        modify_filename.assert_not_called()
+
         annotation.assert_called_with(
             ANY,
             automated=False,
@@ -127,8 +131,10 @@ def test_handler_messages_v2(
     @patch("lambda_function.send_new_judgment_notification")
     @patch("lambda_function.send_updated_judgment_notification")
     @patch("lambda_function.VersionAnnotation")
+    @patch("lambda_function.modify_filename")
     def test_handler_messages_s3(
         self,
+        modify_filename,
         annotation,
         notify_new,
         notify_updated,
@@ -161,6 +167,7 @@ def test_handler_messages_s3(
         assert apiclient.set_published.call_count == 2
         notify_new.assert_not_called()
         notify_updated.assert_not_called()
+        modify_filename.assert_not_called()
         annotation.assert_called_with(
             ANY,
             automated=True,
@@ -820,3 +827,18 @@ def test_unquote_s3(self, getenv, os):
         mock_s3_client.download_file.assert_called_with(
             ANY, "2010 Reported/[2010]/1.tar.gz", ANY
         )
+
+
+modify_filename_data = [
+    ["TRE-2023-XYZ.tar.gz", "TRE-2023-XYZ_.tar.gz"],
+    ["/a/b/c.d.e", "/a/b/c_.d.e"],
+    [
+        "",
+        "_",
+    ],
+]
+
+
+@pytest.mark.parametrize("was, now", modify_filename_data)
+def test_modify_targz_filename(was, now):
+    assert lambda_function.modify_filename(was, addition="_") == now