update

FloRul · May 3, 2024 · 7eca90d · 7eca90d
1 parent 299d1ed
commit 7eca90d
Show file tree

Hide file tree

Showing 3 changed files with 39 additions and 6 deletions.
diff --git a/lambdas/docx_ingestion/src/index.py b/lambdas/docx_ingestion/src/index.py
@@ -9,7 +9,7 @@
 
 def lambda_handler(event, context):
     """
-    Downloads the given docx file from S3, extracts the text content and saves it as a txt file in the same bucket, adjacent to the docx file.
+    Downloads the given docx file from S3, extracts the text content and saves it as a txt file in the same bucket, adjacent to the original docx file.
     """
     try:
         print(event)
@@ -20,7 +20,7 @@ def lambda_handler(event, context):
         if os.path.splitext(key)[1][1:] != "docx":
             return {
                 'statusCode': 400,
-                'body': 'Invalid file format. Only PDF files are supported.'
+                'body': 'Invalid file format. Only docx files are supported.'
             }
 
         downloaded_docx_path = fetch_file(bucket, key)

diff --git a/lambdas/rich_pdf_ingestion/src/index.py b/lambdas/rich_pdf_ingestion/src/index.py
@@ -45,7 +45,7 @@ def lambda_handler(event, context):
         return {
             'statusCode': 200,
             'body': 'PDF text content extracted and saved',
-            'attachment_uris': extracted_files_s3_uris
+            'extracted_file_uris': extracted_files_s3_uris
         }
 
     except Exception as e:

diff --git a/state_machines/rfp_email_form_fill/state_machine.json b/state_machines/rfp_email_form_fill/state_machine.json
@@ -128,7 +128,7 @@
               "ItemSelector": {
                 "system_prompt.$": "$.get_prompts_output.parsed_JSON.system_prompt",
                 "prompt.$": "$$.Map.Item.Value",
-                "s3_uris.$": "$.parsed_attachments_for_llm_ouput.attachment_arns"
+                "s3_uris.$": "$.parsed_attachments_for_llm_ouput.attachment_uris"
               },
               "ResultPath": "$.prompt_responses",
               "Next": "Map claude response to document filler argument",
@@ -251,6 +251,11 @@
                         "Variable": "$",
                         "StringMatches": "*.txt",
                         "Next": "Do nothing on supported file type (txt)"
+                      },
+                      {
+                        "Variable": "$",
+                        "StringMatches": "*.docx",
+                        "Next": "Extract text from DOCX"
                       }
                     ],
                     "Default": "Filter out unsupported attachment",
@@ -268,7 +273,7 @@
                   },
                   "Extract text from PDF": {
                     "End": true,
-                    "OutputPath": "$.Payload.attachment_uris",
+                    "OutputPath": "$.Payload.extracted_file_uris",
                     "Parameters": {
                       "FunctionName": "arn:aws:lambda:us-east-1:446872271111:function:rich_pdf_ingestion:$LATEST",
                       "Payload": {
@@ -298,13 +303,41 @@
                       "arrr.$": "States.Array($)"
                     },
                     "OutputPath": "$.arrr"
+                  },
+                  "Extract text from DOCX": {
+                    "Type": "Task",
+                    "Resource": "arn:aws:states:::lambda:invoke",
+                    "OutputPath": "$.extracted_text",
+                    "Parameters": {
+                      "FunctionName": "arn:aws:lambda:us-east-1:446872271111:function:levio-esta-docx-ingestion-dev:$LATEST",
+                      "Payload": {
+                        "docx_s3_uri.$": "$"
+                      }
+                    },
+                    "Retry": [
+                      {
+                        "ErrorEquals": [
+                          "Lambda.ServiceException",
+                          "Lambda.AWSLambdaException",
+                          "Lambda.SdkClientException",
+                          "Lambda.TooManyRequestsException"
+                        ],
+                        "IntervalSeconds": 1,
+                        "MaxAttempts": 3,
+                        "BackoffRate": 2
+                      }
+                    ],
+                    "End": true,
+                    "ResultSelector": {
+                      "extracted_text.$": "States.Array($.Payload.attachment_uri)"
+                    }
                   }
                 }
               },
               "ItemsPath": "$.download_email_attachments_ouput.Payload.attachments",
               "Next": "Get Prompts",
               "ResultSelector": {
-                "attachment_arns.$": "$[*][*]"
+                "attachment_uris.$": "$[*][*]"
               },
               "Type": "Map",
               "ResultPath": "$.parsed_attachments_for_llm_ouput"