Skip to content

Commit

Permalink
refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
joelbalcaen committed Apr 23, 2024
1 parent 8f7add8 commit 155bee5
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 33 deletions.
67 changes: 46 additions & 21 deletions lambdas/rich_pdf_ingestion/src/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import os
import boto3
from pypdf import PdfReader
import uuid

OBJECT_CREATED = "ObjectCreated"
EXTRACTED_TEXT_S3_OBJECT_KEY_PREFIX = 'pdf_extraction_result'
Expand All @@ -23,10 +24,29 @@ def extract_text_from_pdf(pdf_file_path):
return text


def get_bucket_and_key(record):
    """Return the ``(bucket, key)`` pair named by an S3 record.

    Reads ``record["s3"]["bucket"]["name"]`` and
    ``record["s3"]["object"]["key"]``; for a dict input, a missing entry
    raises ``KeyError``.
    """
    s3_section = record["s3"]
    return s3_section["bucket"]["name"], s3_section["object"]["key"]
def parse_s3_arn(s3_arn):
    """Split an S3 object ARN into bucket, key, folder and file stem.

    Args:
        s3_arn: String such as ``arn:aws:s3:::my-bucket/dir/file.pdf``. A
            string that does not start with ``arn:aws:s3:::`` is treated
            as an already-stripped ``bucket/key`` path.

    Returns:
        A dict with these entries:
            ``bucket``: text before the first ``/``.
            ``folder``: the key without its last ``/``-separated component
                (``""`` when the key contains no ``/``).
            ``filename_without_extension``: last path component with its
                final extension removed.
            ``key``: everything after the first ``/`` (``""`` if none).
    """
    arn_prefix = "arn:aws:s3:::"

    # Strip the prefix only from the start of the string. str.replace would
    # also delete the same text if it happened to appear inside the key.
    if s3_arn.startswith(arn_prefix):
        s3_path = s3_arn[len(arn_prefix):]
    else:
        s3_path = s3_arn

    # The bucket is the first path component; the key is everything after it.
    bucket, _, key = s3_path.partition("/")

    # The folder is the key minus its final component.
    folder = key.rpartition("/")[0]

    filename_without_extension = os.path.splitext(os.path.basename(s3_path))[0]

    return {
        "bucket": bucket,
        "folder": folder,
        "filename_without_extension": filename_without_extension,
        "key": key,
    }


def fetch_file(bucket, key):
Expand All @@ -35,42 +55,47 @@ def fetch_file(bucket, key):
return local_filename


def upload_text(extracted_text, bucket, key):
file_name = os.path.splitext(
os.path.basename(key)
)[0] + "_pdf_extracted_text.txt"
def upload_file(file_to_upload, bucket, key):
    """Upload a local file to ``bucket`` under ``key``.

    The file is opened in binary mode and handed to the module-level
    ``s3`` object's ``upload_fileobj`` (presumably a boto3 S3 client;
    it is defined outside this block).

    Args:
        file_to_upload: Path of the local file to read.
        bucket: Destination bucket name.
        key: Destination object key.
    """
    with open(file_to_upload, "rb") as f:
        s3.upload_fileobj(f, bucket, key)

    # Log the key that was actually written. The earlier message used
    # `s3_object_key`, which does not exist in this function and raised
    # NameError after every successful upload.
    print(f"Stored file {key} in bucket {bucket}")

local_file_path = f"{PATH_TO_WRITE_FILES}/{file_name}"

# build a new object key in an adjacent folder
parts = key.split('/')
parts.insert(-1, EXTRACTED_TEXT_S3_OBJECT_KEY_PREFIX)
s3_object_key = '/'.join(parts[:-1] + [file_name])
def store_extracted_text_in_local_file(extracted_text):
    """Write ``extracted_text`` to a uniquely named local file.

    The file is created under ``PATH_TO_WRITE_FILES`` with a random UUID4
    as its name, so repeated calls do not overwrite each other.

    Args:
        extracted_text: The text to persist.

    Returns:
        The path of the file that was written.
    """
    local_file_path = f"{PATH_TO_WRITE_FILES}/{uuid.uuid4()}"

    # Explicit UTF-8 so text with non-ASCII characters is written the same
    # way regardless of the runtime's default locale.
    with open(local_file_path, "w", encoding="utf-8") as f:
        f.write(extracted_text)

    # Uploading is not done here: this function has no bucket or object key
    # to upload to, so it only returns the local path for the caller to use.
    return local_file_path


def lambda_handler(event, context):
print(event)
attachment_path = event['path']

attachment_s3_arn = event['path']

try:
bucket, key = get_bucket_and_key(attachment_path)
s3_info = parse_s3_arn(attachment_s3_arn)
bucket = s3_info["bucket"]
folder = s3_info['folder']
key = s3_info["key"]
filename_without_extension = s3_info['filename_without_extension']
print(f"source_bucket: {bucket}, source_key: {key}")

if os.path.splitext(key)[1][1:] == "pdf":
local_filename = fetch_file(bucket, key)
print("Extracting text from pdf")
extracted_text = extract_text_from_pdf(local_filename)
extracted_text_local_file = store_extracted_text_in_local_file(extracted_text)
print("Finished extracting text from pdf")
upload_text(extracted_text, bucket, key)
extracted_text_s3_key = "/".join([folder, filename_without_extension+"_extracted_pdf_content", str(uuid.uuid4())+".txt"])
print("Uploading file to ", extracted_text_s3_key)
upload_file(
file_to_upload=extracted_text_local_file,
bucket=bucket,
key=extracted_text_s3_key
)

except Exception as e:
print(e)
Expand Down
35 changes: 23 additions & 12 deletions state_machines/email_form_fill/state_machine.tf
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ resource "aws_sfn_state_machine" "sfn_state_machine" {
"States": {
"Download email attachments": {
"Comment": "Extract attachments from a raw email MIME file and stores them in S3",
"Next": "Filter PDF attachments",
"Next": "Map",
"OutputPath": "$.Payload",
"Parameters": {
"FunctionName": "email-attachment-saver-dev",
Expand All @@ -136,16 +136,30 @@ resource "aws_sfn_state_machine" "sfn_state_machine" {
],
"Type": "Task"
},
"Extract Text/Tables/Images from PDF attachments": {
"End": true,
"Map": {
"Type": "Map",
"ItemProcessor": {
"ProcessorConfig": {
"Mode": "INLINE"
},
"StartAt": "Rich PDF Ingestion",
"StartAt": "Choice",
"States": {
"Choice": {
"Type": "Choice",
"Choices": [
{
"Variable": "$.extension",
"StringEquals": "pdf",
"Next": "Rich PDF Ingestion"
}
],
"Default": "Pass"
},
"Pass": {
"Type": "Pass",
"End": true
},
"Rich PDF Ingestion": {
"End": true,
"OutputPath": "$.Payload",
"Parameters": {
"FunctionName": "arn:aws:lambda:us-east-1:446872271111:function:rich_pdf_ingestion:$LATEST",
Expand All @@ -165,16 +179,13 @@ resource "aws_sfn_state_machine" "sfn_state_machine" {
"MaxAttempts": 3
}
],
"Type": "Task"
"Type": "Task",
"End": true
}
}
},
"Type": "Map"
},
"Filter PDF attachments": {
"InputPath": "$..attachment_arns[?(@.extension==pdf)]",
"Next": "Extract Text/Tables/Images from PDF attachments",
"Type": "Pass"
"ItemsPath": "$.attachment_arns",
"End": true
}
}
}
Expand Down

0 comments on commit 155bee5

Please sign in to comment.