diff --git a/lambdas/rich_pdf_ingestion/src/index.py b/lambdas/rich_pdf_ingestion/src/index.py index 79d6a48..4ee82df 100644 --- a/lambdas/rich_pdf_ingestion/src/index.py +++ b/lambdas/rich_pdf_ingestion/src/index.py @@ -23,30 +23,32 @@ def lambda_handler(event, context): print("Attachment s3 arn parsed info: ", attachment_s3_info) bucket = attachment_s3_info["bucket"] key = attachment_s3_info["key"] - extracted_files_s3_arns = [] - - if os.path.splitext(key)[1][1:] == "pdf": - local_filename = fetch_file(bucket, key) - print("Extracting text from pdf") - extracted_text = extract_text_from_pdf(local_filename) - extracted_text_local_file = store_extracted_text_in_local_file(extracted_text) - print("Finished extracting text from pdf") - base_name = Path(key).stem - new_key = f"{base_name}_extracted_pdf_content.txt" - print("Uploading file to ", new_key) - upload_file( - file_to_upload=extracted_text_local_file, - bucket=bucket, - key=new_key - ) - extracted_files_s3_arns.append(f"arn:aws:s3:::{bucket}/{new_key}") + if os.path.splitext(key)[1][1:] != "pdf": return { - 'statusCode': 200, - 'body': 'PDF text content extracted and saved', - 'attachment_arns': extracted_files_s3_arns + 'statusCode': 400, + 'body': 'Invalid file format. Only PDF files are supported.' } + local_filename = fetch_file(bucket, key) + extracted_text = extract_text_from_pdf(local_filename) + extracted_text_local_file = store_extracted_text_in_local_file(extracted_text) + base_name = Path(key).stem + new_key = f"{base_name}_extracted_pdf_content.txt" + upload_file( + file_to_upload=extracted_text_local_file, + bucket=bucket, + key=new_key + ) + extracted_files_s3_arns = [] + extracted_files_s3_arns.append(f"arn:aws:s3:::{bucket}/{new_key}") + + return { + 'statusCode': 200, + 'body': 'PDF text content extracted and saved', + 'attachment_arns': extracted_files_s3_arns + } + except Exception as e: print(e) return { @@ -79,18 +81,13 @@ def parse_s3_arn(s3_arn): # The first component is the bucket bucket = components[0] - # The rest of the components form the key - key = "/".join(components[1:]) # The folder is all components of the key except the last one folder = "/".join(components[1:-1]) - filename_without_extension = os.path.splitext(os.path.basename(s3_path))[0] return { "bucket": bucket, "folder": folder, - "filename_without_extension": filename_without_extension, - "key": key } diff --git a/state_machines/email_form_fill/state_machine.tf b/state_machines/email_form_fill/state_machine.tf index 77f01bb..5526349 100644 --- a/state_machines/email_form_fill/state_machine.tf +++ b/state_machines/email_form_fill/state_machine.tf @@ -106,18 +106,33 @@ resource "aws_sfn_state_machine" "sfn_state_machine" { } }, { - "StartAt": "Create copy of the RFP Form", + "StartAt": "Create copy of the RFP Form doc", "States": { - "Create copy of the RFP Form": { - "Type": "Task", + "Create copy of the RFP Form doc": { + "Comment": "Copy the form to be filled into this execution's email folder", "End": true, "Parameters": { - "CopySource.$": "States.Format('{}/rfp/standard/rfp.docx', $.bucket)", "Bucket.$": "$.bucket", + "CopySource.$": "States.Format('{}/rfp/standard/rfp.docx', $.bucket)", "Key.$": "States.Format('rfp/{}/formulaire_ao.docx', $.email_id)" }, "Resource": "arn:aws:states:::aws-sdk:s3:copyObject", - "Comment": "Copy the form to be filled into this execution's email folder" + "Type": "Task" + } + } + }, + { + "StartAt": "Create copy of the RFP prompts and answers JSON", + "States": { + "Create copy of the RFP prompts and answers JSON": { + "Type": "Task", + "End": true, + "Parameters": { + "Bucket.$": "$.bucket", + "CopySource.$": "States.Format('{}/rfp/standard/rfp_prompts.json', $.bucket)", + "Key.$": "States.Format('rfp/{}/rfp_prompts.json', $.email_id)" + }, + "Resource": "arn:aws:states:::aws-sdk:s3:copyObject" } } }, @@ -172,13 +187,16 @@ resource "aws_sfn_state_machine" "sfn_state_machine" { "Type": "Choice" }, "Pass": { + "Comment": "Attachment is not PDF, no other processing needed. Map the input to an array just so it's easier to flatten the results of the map state.", "End": true, "Type": "Pass", - "Comment": "Attachment is not PDF, no other processing needed." + "Parameters": { + "arrr.$": "States.Array($)" + }, + "OutputPath": "$.arrr" }, "Rich PDF Ingestion": { - "End": true, - "OutputPath": "$.Payload", + "OutputPath": "$.Payload.attachment_arns", "Parameters": { "FunctionName": "arn:aws:lambda:us-east-1:446872271111:function:rich_pdf_ingestion:$LATEST", "Payload": { @@ -199,12 +217,16 @@ resource "aws_sfn_state_machine" "sfn_state_machine" { "MaxAttempts": 3 } ], - "Type": "Task" + "Type": "Task", + "End": true } } }, "ItemsPath": "$.attachment_arns", - "Type": "Map" + "Type": "Map", + "ResultSelector": { + "attachment_arns.$": "$[*][*]" + } } } }