Skip to content

Commit

Permalink
refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
joelbalcaen committed Apr 24, 2024
1 parent 9f29131 commit 802ff40
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 29 deletions.
11 changes: 3 additions & 8 deletions lambdas/email_attachment_saver/src/index.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
import boto3
import email
from botocore.exceptions import NoCredentialsError
from aws_lambda_powertools import Logger, Metrics
from aws_lambda_powertools import Logger

logger = Logger()
metrics = Metrics()
s3 = boto3.client('s3')


@logger.inject_lambda_context
def lambda_handler(event, context):
"""
This lambda downloads an email MIME file from S3, extracts its attachments, and saves them to another S3 folder.
Expand Down Expand Up @@ -39,11 +38,7 @@ def lambda_handler(event, context):
key = "/".join([s3_folder,part.get_filename()])
logger.info(f"Putting object in bucket:{bucket} and key:{key}")
s3.put_object(Bucket=bucket, Key=key, Body=part.get_payload(decode=True))
file_extension = key.split('.')[-1] if '.' in key else None
attachment_arns.append({
'path': 'arn:aws:s3:::' + bucket + '/' + key,
'extension': file_extension
})
attachment_arns.append('arn:aws:s3:::' + bucket + '/' + key,)

except NoCredentialsError:
logger.error('No AWS credentials found')
Expand Down
34 changes: 23 additions & 11 deletions lambdas/rich_pdf_ingestion/src/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,6 @@ def upload_file(file_to_upload, bucket, key,):
with open(file_to_upload, "rb") as f:
s3.upload_fileobj(f, bucket, key)

print(f"Stored file {s3_object_key} in bucket {bucket}")


def store_extracted_text_in_local_file(extracted_text):
local_file_path = f"{PATH_TO_WRITE_FILES}/{str(uuid.uuid4())}"
Expand All @@ -76,27 +74,41 @@ def lambda_handler(event, context):
attachment_s3_arn = event['path']

try:
s3_info = parse_s3_arn(attachment_s3_arn)
bucket = s3_info["bucket"]
folder = s3_info['folder']
key = s3_info["key"]
filename_without_extension = s3_info['filename_without_extension']
print(f"source_bucket: {bucket}, source_key: {key}")
attachment_s3_info = parse_s3_arn(attachment_s3_arn)
print("Attachment s3 arn parsed info: ", attachment_s3_info)
bucket = attachment_s3_info["bucket"]
folder = attachment_s3_info['folder']
key = attachment_s3_info["key"]
filename_without_extension = attachment_s3_info['filename_without_extension']
extracted_files_s3_arns = []

if os.path.splitext(key)[1][1:] == "pdf":
local_filename = fetch_file(bucket, key)
print("Extracting text from pdf")
extracted_text = extract_text_from_pdf(local_filename)
extracted_text_local_file = store_extracted_text_in_local_file(extracted_text)
extracted_text_local_file = store_extracted_text_in_local_file(
extracted_text)
print("Finished extracting text from pdf")
extracted_text_s3_key = "/".join([folder, filename_without_extension+"_extracted_pdf_content", str(uuid.uuid4())+".txt"])
extracted_text_s3_key = "/".join(
[folder, filename_without_extension+"_extracted_pdf_content", str(uuid.uuid4())+".txt"])
print("Uploading file to ", extracted_text_s3_key)
upload_file(
file_to_upload=extracted_text_local_file,
bucket=bucket,
key=extracted_text_s3_key
)
extracted_files_s3_arns.append(
f"arn:aws:s3:::{bucket}/{extracted_text_s3_key}")

return {
'statusCode': 200,
'body': 'PDF text content extracted and saved',
'attachment_arns': extracted_files_s3_arns
}

except Exception as e:
print(e)
raise e
return {
'statusCode': 400,
'body': e
}
20 changes: 10 additions & 10 deletions state_machines/email_form_fill/state_machine.tf
Original file line number Diff line number Diff line change
Expand Up @@ -137,29 +137,30 @@ resource "aws_sfn_state_machine" "sfn_state_machine" {
"Type": "Task"
},
"Map": {
"Type": "Map",
"End": true,
"ItemProcessor": {
"ProcessorConfig": {
"Mode": "INLINE"
},
"StartAt": "Choice",
"States": {
"Choice": {
"Type": "Choice",
"Choices": [
{
"Variable": "$.extension",
"StringEquals": "pdf",
"Variable": "$",
"StringMatches": "*.pdf",
"Next": "Rich PDF Ingestion"
}
],
"Default": "Pass"
"Default": "Pass",
"Type": "Choice"
},
"Pass": {
"Type": "Pass",
"End": true
"End": true,
"Type": "Pass"
},
"Rich PDF Ingestion": {
"End": true,
"OutputPath": "$.Payload",
"Parameters": {
"FunctionName": "arn:aws:lambda:us-east-1:446872271111:function:rich_pdf_ingestion:$LATEST",
Expand All @@ -179,13 +180,12 @@ resource "aws_sfn_state_machine" "sfn_state_machine" {
"MaxAttempts": 3
}
],
"Type": "Task",
"End": true
"Type": "Task"
}
}
},
"ItemsPath": "$.attachment_arns",
"End": true
"Type": "Map"
}
}
}
Expand Down

0 comments on commit 802ff40

Please sign in to comment.