Skip to content

Commit

Permalink
update
Browse files: browse the repository at this point in the history
  • Loading branch information
joelbalcaen committed May 3, 2024
1 parent 299d1ed commit 7eca90d
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 6 deletions.
4 changes: 2 additions & 2 deletions lambdas/docx_ingestion/src/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

def lambda_handler(event, context):
"""
Downloads the given docx file from S3, extracts the text content and saves it as a txt file in the same bucket, adjacent to the docx file.
Downloads the given docx file from S3, extracts the text content and saves it as a txt file in the same bucket, adjacent to the original docx file.
"""
try:
print(event)
Expand All @@ -20,7 +20,7 @@ def lambda_handler(event, context):
if os.path.splitext(key)[1][1:] != "docx":
return {
'statusCode': 400,
'body': 'Invalid file format. Only PDF files are supported.'
'body': 'Invalid file format. Only docx files are supported.'
}

downloaded_docx_path = fetch_file(bucket, key)
Expand Down
2 changes: 1 addition & 1 deletion lambdas/rich_pdf_ingestion/src/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def lambda_handler(event, context):
return {
'statusCode': 200,
'body': 'PDF text content extracted and saved',
'attachment_uris': extracted_files_s3_uris
'extracted_file_uris': extracted_files_s3_uris
}

except Exception as e:
Expand Down
39 changes: 36 additions & 3 deletions state_machines/rfp_email_form_fill/state_machine.json
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@
"ItemSelector": {
"system_prompt.$": "$.get_prompts_output.parsed_JSON.system_prompt",
"prompt.$": "$$.Map.Item.Value",
"s3_uris.$": "$.parsed_attachments_for_llm_ouput.attachment_arns"
"s3_uris.$": "$.parsed_attachments_for_llm_ouput.attachment_uris"
},
"ResultPath": "$.prompt_responses",
"Next": "Map claude response to document filler argument",
Expand Down Expand Up @@ -251,6 +251,11 @@
"Variable": "$",
"StringMatches": "*.txt",
"Next": "Do nothing on supported file type (txt)"
},
{
"Variable": "$",
"StringMatches": "*.docx",
"Next": "Extract text from DOCX"
}
],
"Default": "Filter out unsupported attachment",
Expand All @@ -268,7 +273,7 @@
},
"Extract text from PDF": {
"End": true,
"OutputPath": "$.Payload.attachment_uris",
"OutputPath": "$.Payload.extracted_file_uris",
"Parameters": {
"FunctionName": "arn:aws:lambda:us-east-1:446872271111:function:rich_pdf_ingestion:$LATEST",
"Payload": {
Expand Down Expand Up @@ -298,13 +303,41 @@
"arrr.$": "States.Array($)"
},
"OutputPath": "$.arrr"
},
"Extract text from DOCX": {
"Type": "Task",
"Resource": "arn:aws:states:::lambda:invoke",
"OutputPath": "$.extracted_text",
"Parameters": {
"FunctionName": "arn:aws:lambda:us-east-1:446872271111:function:levio-esta-docx-ingestion-dev:$LATEST",
"Payload": {
"docx_s3_uri.$": "$"
}
},
"Retry": [
{
"ErrorEquals": [
"Lambda.ServiceException",
"Lambda.AWSLambdaException",
"Lambda.SdkClientException",
"Lambda.TooManyRequestsException"
],
"IntervalSeconds": 1,
"MaxAttempts": 3,
"BackoffRate": 2
}
],
"End": true,
"ResultSelector": {
"extracted_text.$": "States.Array($.Payload.attachment_uri)"
}
}
}
},
"ItemsPath": "$.download_email_attachments_ouput.Payload.attachments",
"Next": "Get Prompts",
"ResultSelector": {
"attachment_arns.$": "$[*][*]"
"attachment_uris.$": "$[*][*]"
},
"Type": "Map",
"ResultPath": "$.parsed_attachments_for_llm_ouput"
Expand Down

0 comments on commit 7eca90d

Please sign in to comment.