Skip to content

Commit

Permalink
Merge pull request #166 from nationalarchives/FCL-67-tar-gz-different…
Browse files Browse the repository at this point in the history
…-name-if-docx

Modify targz filename in S3 to include _nodocx if no docx
  • Loading branch information
dragon-dxw authored Apr 23, 2024
2 parents adc2452 + 0eb7b1d commit cbcbaa9
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 3 deletions.
27 changes: 24 additions & 3 deletions ds-caselaw-ingester/lambda_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,16 @@ class DocumentInsertionError(ReportableException):
pass


def modify_filename(original: str, addition: str) -> str:
    """Insert ``addition`` into a filename just before its first dot.

    Only the final path component is touched; any directory part is kept
    as given. For example, with ``addition="_nodocx"``,
    ``TRE-2024-A.tar.gz`` becomes ``TRE-2024-A_nodocx.tar.gz``.

    If the final component contains no dot, ``addition`` is appended to
    the end of it.
    """
    directory, name = os.path.split(original)
    # Everything up to (but not including) the first dot is the stem; a
    # name without any dot is treated as being all stem.
    cut = name.find(".")
    if cut == -1:
        cut = len(name)
    renamed = f"{name[:cut]}{addition}{name[cut:]}"
    return os.path.join(directory, renamed)


def all_messages(event) -> List[Message]:
"""All the messages in the SNS event, as Message subclasses"""
decoder = json.decoder.JSONDecoder()
Expand Down Expand Up @@ -505,12 +515,23 @@ def process_message(message):
if has_TDR_data:
store_metadata(uri, metadata)

# Determine if there's a word document -- we need to know before we save the tar.gz file
docx_filename = extract_docx_filename(metadata, consignment_reference)
print(f"extracted docx filename is {docx_filename!r}")

# Copy original tarfile
store_file(open(filename, mode="rb"), uri, os.path.basename(filename), s3_client)
modified_targz_filename = (
filename if docx_filename else modify_filename(filename, "_nodocx")
)
store_file(
open(modified_targz_filename, mode="rb"),
uri,
os.path.basename(filename),
s3_client,
)
print(f"saved tar.gz as {modified_targz_filename!r}")

# Store docx and rename
docx_filename = extract_docx_filename(metadata, consignment_reference)
print(f"extracted docx filename is {docx_filename!r}")
# The docx_filename is None for files which have been reparsed.
if docx_filename is not None:
copy_file(
Expand Down
22 changes: 22 additions & 0 deletions ds-caselaw-ingester/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,10 @@ class TestHandler:
@patch("lambda_function.send_updated_judgment_notification")
@patch("lambda_function.send_new_judgment_notification")
@patch("lambda_function.VersionAnnotation")
@patch("lambda_function.modify_filename")
def test_handler_messages_v2(
self,
modify_filename,
annotation,
notify_new,
notify_update,
Expand Down Expand Up @@ -114,6 +116,8 @@ def test_handler_messages_v2(
notify_update.assert_called()
assert notify_update.call_count == 2
notify_new.assert_not_called()
modify_filename.assert_not_called()

annotation.assert_called_with(
ANY,
automated=False,
Expand All @@ -127,8 +131,10 @@ def test_handler_messages_v2(
@patch("lambda_function.send_new_judgment_notification")
@patch("lambda_function.send_updated_judgment_notification")
@patch("lambda_function.VersionAnnotation")
@patch("lambda_function.modify_filename")
def test_handler_messages_s3(
self,
modify_filename,
annotation,
notify_new,
notify_updated,
Expand Down Expand Up @@ -161,6 +167,7 @@ def test_handler_messages_s3(
assert apiclient.set_published.call_count == 2
notify_new.assert_not_called()
notify_updated.assert_not_called()
modify_filename.assert_not_called()
annotation.assert_called_with(
ANY,
automated=True,
Expand Down Expand Up @@ -820,3 +827,18 @@ def test_unquote_s3(self, getenv, os):
mock_s3_client.download_file.assert_called_with(
ANY, "2010 Reported/[2010]/1.tar.gz", ANY
)


# [input filename, expected result] pairs for modify_filename(..., addition="_").
modify_filename_data = [
    # The addition goes before the first dot of a compound extension.
    ["TRE-2023-XYZ.tar.gz", "TRE-2023-XYZ_.tar.gz"],
    # Directory components are untouched; only the last component changes.
    ["/a/b/c.d.e", "/a/b/c_.d.e"],
    # An empty filename yields just the addition.
    ["", "_"],
]


@pytest.mark.parametrize("was, now", modify_filename_data)
def test_modify_targz_filename(was, now):
    """Check that modify_filename turns each `was` into `now` when adding "_"."""
    result = lambda_function.modify_filename(was, addition="_")
    assert result == now

0 comments on commit cbcbaa9

Please sign in to comment.