Skip to content

Commit

Permalink
Merge pull request #138 from FloRul/rich_pdf_ingestion_lambda
Browse files Browse the repository at this point in the history
new lambda
  • Loading branch information
joelbalcaen authored Apr 4, 2024
2 parents c386178 + c00db8c commit 99868c8
Show file tree
Hide file tree
Showing 7 changed files with 148 additions and 1 deletion.
2 changes: 1 addition & 1 deletion lambdas/ingestion/src/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from langchain_community.embeddings import BedrockEmbeddings
from langchain_community.vectorstores.pgvector import PGVector
from botocore.exceptions import ClientError
from botocore.exceptions import NoCredentialsError, BotoCoreError
from botocore.exceptions import NoCredentialsError, BotoCoreError, ClientError
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
Expand Down
50 changes: 50 additions & 0 deletions lambdas/rich_pdf_ingestion/lambda.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Shared settings for the rich_pdf_ingestion Lambda.
# The unused `ses_arn` local and the `aws_caller_identity` data source that only
# existed to build it were removed: nothing in this module references them.
locals {
  # Function name; also used to derive the IAM role name and the log policy ARN.
  lambda_function_name = "rich_pdf_ingestion"

  # Lambda execution timeout, in seconds.
  timeout = 30
  runtime = "python3.11"

  # Region-scoped ARN of the Powertools for AWS Lambda (Python) V2 layer, version 67.
  powertools_layer_arn = "arn:aws:lambda:${var.aws_region}:017000801446:layer:AWSLambdaPowertoolsPythonV2:67"
}


# Builds ./src and deploys it as the rich_pdf_ingestion Lambda function.
#
# NOTE(review): despite the module label, this is built from `source_path`
# rather than from a container image; the label is kept because output.tf
# references it.
# NOTE(review): src/index.py downloads objects from S3 and parses SQS-shaped
# events, but only CloudWatch Logs permissions are granted here - confirm that
# s3:GetObject and the SQS permissions are attached elsewhere.
module "lambda_function_container_image" {
  source                   = "terraform-aws-modules/lambda/aws"
  function_name            = local.lambda_function_name
  handler                  = "index.lambda_handler"
  publish                  = true
  runtime                  = local.runtime
  timeout                  = local.timeout
  layers                   = [local.powertools_layer_arn]
  source_path              = "${path.module}/src"
  s3_bucket                = var.lambda_storage_bucket
  memory_size              = 256
  role_name                = "${local.lambda_function_name}-role"
  attach_policy_statements = true

  policy_statements = {
    log_group = {
      effect = "Allow"
      actions = [
        "logs:CreateLogGroup"
      ]
      resources = [
        "arn:aws:logs:*:*:*"
      ]
    }

    log_write = {
      effect = "Allow"

      # Lambda writes to the log group /aws/lambda/<function_name>; the previous
      # pattern (/aws/<function_name>/*) never matched that log group.
      resources = [
        "arn:aws:logs:*:*:log-group:/aws/lambda/${local.lambda_function_name}:*"
      ]

      actions = [
        "logs:CreateLogStream",
        "logs:PutLogEvents",
      ]
    }
  }
}
7 changes: 7 additions & 0 deletions lambdas/rich_pdf_ingestion/output.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
output "lambda_function_arn" {
  description = "ARN of the rich_pdf_ingestion Lambda function."
  value       = module.lambda_function_container_image.lambda_function_arn
}

output "lambda_function_name" {
  description = "Name of the rich_pdf_ingestion Lambda function."
  value       = module.lambda_function_container_image.lambda_function_name
}
71 changes: 71 additions & 0 deletions lambdas/rich_pdf_ingestion/src/index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import json
import os
from urllib.parse import unquote_plus

import boto3
import tabula
from botocore.exceptions import NoCredentialsError, BotoCoreError, ClientError
from pypdf import PdfReader


# Prefix that lambda_handler matches against a record's "eventName" (via
# str.startswith) to decide whether the record should be processed.
OBJECT_CREATED = "ObjectCreated"


def generate_text_form_pdf(pdf_file_path):
    """Return the text and the tables of a PDF as one string.

    The result is the text of every page (in page order) followed by every
    table detected by tabula, each rendered as an HTML ``<table>`` fragment.

    NOTE(review): tabula-py drives tabula-java, so a Java runtime must be
    available where this runs - confirm the Lambda environment provides one.

    Args:
        pdf_file_path: Path of the PDF file on the local filesystem.

    Returns:
        The concatenated page text and HTML tables.
    """
    reader = PdfReader(pdf_file_path)
    # `or ""` keeps the join safe if a page yields no text (e.g. scanned pages).
    parts = [page.extract_text() or "" for page in reader.pages]

    # lattice=True makes tabula detect tables from their ruling lines.
    data_frames = tabula.read_pdf(pdf_file_path, pages="all", lattice=True)
    parts.extend(df.to_html() for df in data_frames)

    # Single join instead of repeated `+=` on a growing string.
    return "".join(parts)

def get_bucket_and_key(record):
    """Return the bucket name and object key of an S3 event record.

    S3 event notifications URL-encode the object key (a space arrives as
    ``+``, ``(`` as ``%28``, ...), so the key is decoded here to obtain the
    real key that S3 API calls expect.

    Args:
        record: One entry of the ``Records`` list of an S3 event notification.

    Returns:
        A ``(bucket, key)`` tuple with the decoded object key.

    Raises:
        KeyError: If the record lacks the expected ``s3`` structure.
    """
    s3_info = record["s3"]
    bucket = s3_info["bucket"]["name"]
    key = unquote_plus(s3_info["object"]["key"])
    return bucket, key

def fetch_file(bucket, key):
    """Download an S3 object into /tmp and return the local path.

    The local file is named after the last ``/``-separated segment of the key.

    Args:
        bucket: Name of the source bucket.
        key: Key of the object to download.

    Returns:
        The path of the downloaded file under ``/tmp``.

    Raises:
        BotoCoreError: On client-side failures (includes NoCredentialsError).
        ClientError: When the S3 service rejects the request.
    """
    s3 = boto3.client("s3")
    local_filename = f"/tmp/{key.split('/')[-1]}"

    try:
        s3.download_file(bucket, key, local_filename)
    except (BotoCoreError, ClientError) as e:
        # NoCredentialsError is a BotoCoreError subclass, so one handler covers
        # the three cases; a bare `raise` keeps the original traceback.
        print(e)
        raise
    return local_filename


def lambda_handler(event, context):
    """Lambda entry point: extract the text of newly created PDF objects.

    The event is parsed as a batch of messages whose ``body`` is a
    JSON-encoded S3 event notification. Every message of the batch is
    processed (not only the first one). For each S3 record whose event name
    starts with ``ObjectCreated`` and whose key has a ``.pdf`` extension
    (case-insensitive), the object is downloaded, its text is extracted and
    printed, and the temporary file is removed.

    Args:
        event: The invocation event (see above for the expected shape).
        context: The Lambda context object (unused).

    Raises:
        Exception: Any error raised while handling a record is printed and
            re-raised so the invocation fails.
    """
    print(event)
    for message in event["Records"]:
        # S3 test notifications ("s3:TestEvent") carry no "Records" key.
        records = json.loads(message["body"]).get("Records", [])
        for record in records:
            event_name = record["eventName"]
            print(f"eventName: {event_name}")
            local_filename = None
            try:
                bucket, key = get_bucket_and_key(record)
                print(f"source_bucket: {bucket}, source_key: {key}")

                if not event_name.startswith(OBJECT_CREATED):
                    continue
                # Check the extension first so non-PDF objects are never
                # downloaded.
                if os.path.splitext(key)[1].lower() != ".pdf":
                    continue

                local_filename = fetch_file(bucket, key)

                # NOTE: an earlier draft sketched a collection name here as
                # bucket + "-" + os.path.dirname(key).replace("/", "-");
                # it was commented out and is not used yet.

                print("Extracting text from pdf")
                document_text = generate_text_form_pdf(local_filename)
                print(f"Extracted: {document_text}")
            except Exception as e:
                print(e)
                raise
            finally:
                # /tmp is small and survives warm invocations: always clean up.
                if local_filename is not None and os.path.exists(local_filename):
                    os.remove(local_filename)
4 changes: 4 additions & 0 deletions lambdas/rich_pdf_ingestion/src/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
pypdf
tabula-py
pandas
boto3
9 changes: 9 additions & 0 deletions lambdas/rich_pdf_ingestion/variables.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
variable "lambda_storage_bucket" {
  description = "S3 bucket passed to the Lambda module as `s3_bucket` for the function's deployment package."
  type        = string
  nullable    = false
}

variable "aws_region" {
  description = "AWS region used to build region-scoped ARNs (e.g. the Powertools layer ARN)."
  type        = string
  nullable    = false
}
6 changes: 6 additions & 0 deletions terraform/modules.tf
Original file line number Diff line number Diff line change
Expand Up @@ -224,3 +224,9 @@ module "email_receipt_confirmation" {
lambda_storage_bucket = aws_s3_bucket.lambda_storage.id
aws_region = var.aws_region
}

# Deploys the rich_pdf_ingestion Lambda from ../lambdas/rich_pdf_ingestion,
# passing it the Lambda storage bucket and the target region.
# NOTE(review): no event source (e.g. an SQS mapping) is wired to this module
# in the visible hunk - confirm the trigger is configured elsewhere.
module "rich_pdf_ingestion" {
source = "../lambdas/rich_pdf_ingestion"
lambda_storage_bucket = aws_s3_bucket.lambda_storage.id
aws_region = var.aws_region
}

0 comments on commit 99868c8

Please sign in to comment.