Skip to content

Commit

Permalink
refactor
Browse files (browse the repository at this point in the history)
  • Loading branch information
joelbalcaen committed Apr 24, 2024
1 parent 16186f4 commit b6a6979
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 35 deletions.
47 changes: 22 additions & 25 deletions lambdas/rich_pdf_ingestion/src/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,30 +23,32 @@ def lambda_handler(event, context):
print("Attachment s3 arn parsed info: ", attachment_s3_info)
bucket = attachment_s3_info["bucket"]
key = attachment_s3_info["key"]
extracted_files_s3_arns = []

if os.path.splitext(key)[1][1:] == "pdf":
local_filename = fetch_file(bucket, key)
print("Extracting text from pdf")
extracted_text = extract_text_from_pdf(local_filename)
extracted_text_local_file = store_extracted_text_in_local_file(extracted_text)
print("Finished extracting text from pdf")
base_name = Path(key).stem
new_key = f"{base_name}_extracted_pdf_content.txt"
print("Uploading file to ", new_key)
upload_file(
file_to_upload=extracted_text_local_file,
bucket=bucket,
key=new_key
)
extracted_files_s3_arns.append(f"arn:aws:s3:::{bucket}/{new_key}")

if os.path.splitext(key)[1][1:] != "pdf":
return {
'statusCode': 200,
'body': 'PDF text content extracted and saved',
'attachment_arns': extracted_files_s3_arns
'statusCode': 400,
'body': 'Invalid file format. Only PDF files are supported.'
}

local_filename = fetch_file(bucket, key)
extracted_text = extract_text_from_pdf(local_filename)
extracted_text_local_file = store_extracted_text_in_local_file(extracted_text)
base_name = Path(key).stem
new_key = f"{base_name}_extracted_pdf_content.txt"
upload_file(
file_to_upload=extracted_text_local_file,
bucket=bucket,
key=new_key
)
extracted_files_s3_arns = []
extracted_files_s3_arns.append(f"arn:aws:s3:::{bucket}/{new_key}")

return {
'statusCode': 200,
'body': 'PDF text content extracted and saved',
'attachment_arns': extracted_files_s3_arns
}

except Exception as e:
print(e)
return {
Expand Down Expand Up @@ -79,18 +81,13 @@ def parse_s3_arn(s3_arn):
# The first component is the bucket
bucket = components[0]

# The rest of the components form the key
key = "/".join(components[1:])

# The folder is all components of the key except the last one
folder = "/".join(components[1:-1])
filename_without_extension = os.path.splitext(os.path.basename(s3_path))[0]

return {
"bucket": bucket,
"folder": folder,
"filename_without_extension": filename_without_extension,
"key": key
}


Expand Down
42 changes: 32 additions & 10 deletions state_machines/email_form_fill/state_machine.tf
Original file line number Diff line number Diff line change
Expand Up @@ -106,18 +106,33 @@ resource "aws_sfn_state_machine" "sfn_state_machine" {
}
},
{
"StartAt": "Create copy of the RFP Form",
"StartAt": "Create copy of the RFP Form doc",
"States": {
"Create copy of the RFP Form": {
"Type": "Task",
"Create copy of the RFP Form doc": {
"Comment": "Copy the form to be filled into this execution's email folder",
"End": true,
"Parameters": {
"CopySource.$": "States.Format('{}/rfp/standard/rfp.docx', $.bucket)",
"Bucket.$": "$.bucket",
"CopySource.$": "States.Format('{}/rfp/standard/rfp.docx', $.bucket)",
"Key.$": "States.Format('rfp/{}/formulaire_ao.docx', $.email_id)"
},
"Resource": "arn:aws:states:::aws-sdk:s3:copyObject",
"Comment": "Copy the form to be filled into this execution's email folder"
"Type": "Task"
}
}
},
{
"StartAt": "Create copy of the RFP prompts and answers JSON",
"States": {
"Create copy of the RFP prompts and answers JSON": {
"Type": "Task",
"End": true,
"Parameters": {
"Bucket.$": "$.bucket",
"CopySource.$": "States.Format('{}/rfp/standard/rfp_prompts.json', $.bucket)",
"Key.$": "States.Format('rfp/{}/rfp_prompts.json', $.email_id)"
},
"Resource": "arn:aws:states:::aws-sdk:s3:copyObject"
}
}
},
Expand Down Expand Up @@ -172,13 +187,16 @@ resource "aws_sfn_state_machine" "sfn_state_machine" {
"Type": "Choice"
},
"Pass": {
"Comment": "Attachment is not PDF, no other processing needed. Map the input to an array just so it's easier to flatten the results of the map state.",
"End": true,
"Type": "Pass",
"Comment": "Attachment is not PDF, no other processing needed."
"Parameters": {
"arrr.$": "States.Array($)"
},
"OutputPath": "$.arrr"
},
"Rich PDF Ingestion": {
"End": true,
"OutputPath": "$.Payload",
"OutputPath": "$.Payload.attachment_arns",
"Parameters": {
"FunctionName": "arn:aws:lambda:us-east-1:446872271111:function:rich_pdf_ingestion:$LATEST",
"Payload": {
Expand All @@ -199,12 +217,16 @@ resource "aws_sfn_state_machine" "sfn_state_machine" {
"MaxAttempts": 3
}
],
"Type": "Task"
"Type": "Task",
"End": true
}
}
},
"ItemsPath": "$.attachment_arns",
"Type": "Map"
"Type": "Map",
"ResultSelector": {
"attachment_arns.$": "$[*][*]"
}
}
}
}
Expand Down

0 comments on commit b6a6979

Please sign in to comment.