diff --git a/lambdas/email_attachment_saver/src/index.py b/lambdas/email_attachment_saver/src/index.py index 21a8923..f110724 100644 --- a/lambdas/email_attachment_saver/src/index.py +++ b/lambdas/email_attachment_saver/src/index.py @@ -1,13 +1,12 @@ import boto3 import email from botocore.exceptions import NoCredentialsError -from aws_lambda_powertools import Logger, Metrics +from aws_lambda_powertools import Logger logger = Logger() -metrics = Metrics() s3 = boto3.client('s3') - +@logger.inject_lambda_context def lambda_handler(event, context): """ This lambda downloads an email MIME file from S3, extracts its attachments, and saves them to another S3 folder. @@ -39,11 +38,7 @@ def lambda_handler(event, context): key = "/".join([s3_folder,part.get_filename()]) logger.info(f"Putting object in bucket:{bucket} and key:{key}") s3.put_object(Bucket=bucket, Key=key, Body=part.get_payload(decode=True)) - file_extension = key.split('.')[-1] if '.' in key else None - attachment_arns.append({ - 'path': 'arn:aws:s3:::' + bucket + '/' + key, - 'extension': file_extension - }) + attachment_arns.append('arn:aws:s3:::' + bucket + '/' + key,) except NoCredentialsError: logger.error('No AWS credentials found') diff --git a/lambdas/rich_pdf_ingestion/src/index.py b/lambdas/rich_pdf_ingestion/src/index.py index 481bc73..be51160 100644 --- a/lambdas/rich_pdf_ingestion/src/index.py +++ b/lambdas/rich_pdf_ingestion/src/index.py @@ -59,8 +59,6 @@ def upload_file(file_to_upload, bucket, key,): with open(file_to_upload, "rb") as f: s3.upload_fileobj(f, bucket, key) - print(f"Stored file {s3_object_key} in bucket {bucket}") - def store_extracted_text_in_local_file(extracted_text): local_file_path = f"{PATH_TO_WRITE_FILES}/{str(uuid.uuid4())}" @@ -76,27 +74,41 @@ def lambda_handler(event, context): attachment_s3_arn = event['path'] try: - s3_info = parse_s3_arn(attachment_s3_arn) - bucket = s3_info["bucket"] - folder = s3_info['folder'] - key = s3_info["key"] - filename_without_extension = s3_info['filename_without_extension'] - print(f"source_bucket: {bucket}, source_key: {key}") + attachment_s3_info = parse_s3_arn(attachment_s3_arn) + print("Attachment s3 arn parsed info: ", attachment_s3_info) + bucket = attachment_s3_info["bucket"] + folder = attachment_s3_info['folder'] + key = attachment_s3_info["key"] + filename_without_extension = attachment_s3_info['filename_without_extension'] + extracted_files_s3_arns = [] if os.path.splitext(key)[1][1:] == "pdf": local_filename = fetch_file(bucket, key) print("Extracting text from pdf") extracted_text = extract_text_from_pdf(local_filename) - extracted_text_local_file = store_extracted_text_in_local_file(extracted_text) + extracted_text_local_file = store_extracted_text_in_local_file( + extracted_text) print("Finished extracting text from pdf") - extracted_text_s3_key = "/".join([folder, filename_without_extension+"_extracted_pdf_content", str(uuid.uuid4())+".txt"]) + extracted_text_s3_key = "/".join( + [folder, filename_without_extension+"_extracted_pdf_content", str(uuid.uuid4())+".txt"]) print("Uploading file to ", extracted_text_s3_key) upload_file( file_to_upload=extracted_text_local_file, bucket=bucket, key=extracted_text_s3_key ) + extracted_files_s3_arns.append( + f"arn:aws:s3:::{bucket}/{extracted_text_s3_key}") + + return { + 'statusCode': 200, + 'body': 'PDF text content extracted and saved', + 'attachment_arns': extracted_files_s3_arns + } except Exception as e: print(e) - raise e + return { + 'statusCode': 400, + 'body': e + } diff --git a/state_machines/email_form_fill/state_machine.tf b/state_machines/email_form_fill/state_machine.tf index 8426933..fa0c601 100644 --- a/state_machines/email_form_fill/state_machine.tf +++ b/state_machines/email_form_fill/state_machine.tf @@ -137,7 +137,7 @@ resource "aws_sfn_state_machine" "sfn_state_machine" { "Type": "Task" }, "Map": { - "Type": "Map", + "End": true, "ItemProcessor": { "ProcessorConfig": { "Mode": "INLINE" @@ -145,21 +145,22 @@ resource "aws_sfn_state_machine" "sfn_state_machine" { "StartAt": "Choice", "States": { "Choice": { - "Type": "Choice", "Choices": [ { - "Variable": "$.extension", - "StringEquals": "pdf", + "Variable": "$", + "StringMatches": "*.pdf", "Next": "Rich PDF Ingestion" } ], - "Default": "Pass" + "Default": "Pass", + "Type": "Choice" }, "Pass": { - "Type": "Pass", - "End": true + "End": true, + "Type": "Pass" }, "Rich PDF Ingestion": { + "End": true, "OutputPath": "$.Payload", "Parameters": { "FunctionName": "arn:aws:lambda:us-east-1:446872271111:function:rich_pdf_ingestion:$LATEST", @@ -179,13 +180,12 @@ resource "aws_sfn_state_machine" "sfn_state_machine" { "MaxAttempts": 3 } ], - "Type": "Task", - "End": true + "Type": "Task" } } }, "ItemsPath": "$.attachment_arns", - "End": true + "Type": "Map" } } }