refactor

FloRul · Apr 24, 2024 · b6a6979 · b6a6979
1 parent 16186f4
commit b6a6979
Show file tree

Hide file tree

Showing 2 changed files with 54 additions and 35 deletions.
diff --git a/lambdas/rich_pdf_ingestion/src/index.py b/lambdas/rich_pdf_ingestion/src/index.py
@@ -23,30 +23,32 @@ def lambda_handler(event, context):
         print("Attachment s3 arn parsed info: ", attachment_s3_info)
         bucket = attachment_s3_info["bucket"]
         key = attachment_s3_info["key"]
-        extracted_files_s3_arns = []
-
-        if os.path.splitext(key)[1][1:] == "pdf":
-            local_filename = fetch_file(bucket, key)
-            print("Extracting text from pdf")
-            extracted_text = extract_text_from_pdf(local_filename)
-            extracted_text_local_file = store_extracted_text_in_local_file(extracted_text)
-            print("Finished extracting text from pdf")
-            base_name = Path(key).stem
-            new_key = f"{base_name}_extracted_pdf_content.txt"
-            print("Uploading file to ", new_key)
-            upload_file(
-                file_to_upload=extracted_text_local_file,
-                bucket=bucket,
-                key=new_key
-            )
-            extracted_files_s3_arns.append(f"arn:aws:s3:::{bucket}/{new_key}")
 
+        if os.path.splitext(key)[1][1:] != "pdf":
             return {
-                'statusCode': 200,
-                'body': 'PDF text content extracted and saved',
-                'attachment_arns': extracted_files_s3_arns
+                'statusCode': 400,
+                'body': 'Invalid file format. Only PDF files are supported.'
             }
 
+        local_filename = fetch_file(bucket, key)
+        extracted_text = extract_text_from_pdf(local_filename)
+        extracted_text_local_file = store_extracted_text_in_local_file(extracted_text)
+        base_name = Path(key).stem
+        new_key = f"{base_name}_extracted_pdf_content.txt"
+        upload_file(
+            file_to_upload=extracted_text_local_file,
+            bucket=bucket,
+            key=new_key
+        )
+        extracted_files_s3_arns = []
+        extracted_files_s3_arns.append(f"arn:aws:s3:::{bucket}/{new_key}")
+
+        return {
+            'statusCode': 200,
+            'body': 'PDF text content extracted and saved',
+            'attachment_arns': extracted_files_s3_arns
+        }
+
     except Exception as e:
         print(e)
         return {
@@ -79,18 +81,13 @@ def parse_s3_arn(s3_arn):
     # The first component is the bucket
     bucket = components[0]
 
-    # The rest of the components form the key
-    key = "/".join(components[1:])
 
     # The folder is all components of the key except the last one
     folder = "/".join(components[1:-1])
-    filename_without_extension = os.path.splitext(os.path.basename(s3_path))[0]
 
     return {
         "bucket": bucket,
         "folder": folder,
-        "filename_without_extension": filename_without_extension,
-        "key": key
     }
 
 

diff --git a/state_machines/email_form_fill/state_machine.tf b/state_machines/email_form_fill/state_machine.tf
@@ -106,18 +106,33 @@ resource "aws_sfn_state_machine" "sfn_state_machine" {
           }
         },
         {
-          "StartAt": "Create copy of the RFP Form",
+          "StartAt": "Create copy of the RFP Form doc",
           "States": {
-            "Create copy of the RFP Form": {
-              "Type": "Task",
+            "Create copy of the RFP Form doc": {
+              "Comment": "Copy the form to be filled into this execution's email folder",
               "End": true,
               "Parameters": {
-                "CopySource.$": "States.Format('{}/rfp/standard/rfp.docx', $.bucket)",
                 "Bucket.$": "$.bucket",
+                "CopySource.$": "States.Format('{}/rfp/standard/rfp.docx', $.bucket)",
                 "Key.$": "States.Format('rfp/{}/formulaire_ao.docx', $.email_id)"
               },
               "Resource": "arn:aws:states:::aws-sdk:s3:copyObject",
-              "Comment": "Copy the form to be filled into this execution's email folder"
+              "Type": "Task"
+            }
+          }
+        },
+        {
+          "StartAt": "Create copy of the RFP prompts and answers JSON",
+          "States": {
+            "Create copy of the RFP prompts and answers JSON": {
+              "Type": "Task",
+              "End": true,
+              "Parameters": {
+                "Bucket.$": "$.bucket",
+                "CopySource.$": "States.Format('{}/rfp/standard/rfp_prompts.json', $.bucket)",
+                "Key.$": "States.Format('rfp/{}/rfp_prompts.json', $.email_id)"
+              },
+              "Resource": "arn:aws:states:::aws-sdk:s3:copyObject"
             }
           }
         },
@@ -172,13 +187,16 @@ resource "aws_sfn_state_machine" "sfn_state_machine" {
                     "Type": "Choice"
                   },
                   "Pass": {
+                    "Comment": "Attachment is not PDF, no other processing needed. Map the input to an array just so it's easier to flatten the results of the map state.",
                     "End": true,
                     "Type": "Pass",
-                    "Comment": "Attachment is not PDF, no other processing needed."
+                    "Parameters": {
+                      "arrr.$": "States.Array($)"
+                    },
+                    "OutputPath": "$.arrr"
                   },
                   "Rich PDF Ingestion": {
-                    "End": true,
-                    "OutputPath": "$.Payload",
+                    "OutputPath": "$.Payload.attachment_arns",
                     "Parameters": {
                       "FunctionName": "arn:aws:lambda:us-east-1:446872271111:function:rich_pdf_ingestion:$LATEST",
                       "Payload": {
@@ -199,12 +217,16 @@ resource "aws_sfn_state_machine" "sfn_state_machine" {
                         "MaxAttempts": 3
                       }
                     ],
-                    "Type": "Task"
+                    "Type": "Task",
+                    "End": true
                   }
                 }
               },
               "ItemsPath": "$.attachment_arns",
-              "Type": "Map"
+              "Type": "Map",
+              "ResultSelector": {
+                "attachment_arns.$": "$[*][*]"
+              }
             }
           }
         }