Merge pull request #138 from FloRul/rich_pdf_ingestion_lambda

new lambda
FloRul · Apr 4, 2024 · 99868c8 · 99868c8
2 parents c386178 + c00db8c
commit 99868c8
Show file tree

Hide file tree

Showing 7 changed files with 148 additions and 1 deletion.
diff --git a/lambdas/ingestion/src/index.py b/lambdas/ingestion/src/index.py
@@ -5,7 +5,7 @@
 from langchain_community.embeddings import BedrockEmbeddings
 from langchain_community.vectorstores.pgvector import PGVector
 from botocore.exceptions import ClientError
-from botocore.exceptions import NoCredentialsError, BotoCoreError
+from botocore.exceptions import NoCredentialsError, BotoCoreError, ClientError
 from langchain_community.document_loaders import PyPDFLoader
 from langchain_core.documents import Document
 from langchain.text_splitter import RecursiveCharacterTextSplitter

diff --git a/lambdas/rich_pdf_ingestion/lambda.tf b/lambdas/rich_pdf_ingestion/lambda.tf
@@ -0,0 +1,50 @@
+# Values shared by the resources in this module.
+locals {
+  # Name of the Lambda function; also reused for the IAM role name and the log-write policy ARN.
+  lambda_function_name = "rich_pdf_ingestion"
+  # NOTE(review): ses_arn is not referenced by any other block shown for this module — confirm it is still needed.
+  ses_arn              = "arn:aws:ses:${var.aws_region}:${data.aws_caller_identity.current.account_id}"
+  # Passed to the Lambda module as its timeout (presumably seconds — confirm against the module docs).
+  timeout              = 30
+  runtime              = "python3.11"
+  # AWSLambdaPowertoolsPythonV2 layer, version 67, in the configured region.
+  powertools_layer_arn = "arn:aws:lambda:${var.aws_region}:017000801446:layer:AWSLambdaPowertoolsPythonV2:67"
+}
+
+# Looks up the calling AWS identity; its account_id is interpolated into local.ses_arn.
+data "aws_caller_identity" "current" {}
+
+
+# Builds the code under ./src and deploys it as the rich_pdf_ingestion Lambda
+# via the terraform-aws-modules/lambda/aws module.
+# NOTE(review): despite the "container_image" label, this instance sets
+# runtime/handler/source_path rather than an image. The label is referenced by
+# the module's outputs, so it is left unchanged here.
+module "lambda_function_container_image" {
+  source        = "terraform-aws-modules/lambda/aws"
+  function_name = local.lambda_function_name
+  handler       = "index.lambda_handler"
+  publish       = true
+  runtime       = local.runtime
+  timeout       = local.timeout
+  layers        = [local.powertools_layer_arn]
+  source_path   = "${path.module}/src"
+  s3_bucket     = var.lambda_storage_bucket
+  memory_size   = 256
+  role_name     = "${local.lambda_function_name}-role"
+
+  # Attach the inline statements below to the function's execution role.
+  attach_policy_statements = true
+
+  policy_statements = {
+    log_group = {
+      effect = "Allow"
+      actions = [
+        "logs:CreateLogGroup"
+      ]
+      resources = [
+        "arn:aws:logs:*:*:*"
+      ]
+    }
+
+    log_write = {
+      effect = "Allow"
+
+      # Lambda writes to the log group /aws/lambda/<function name>. The previous
+      # pattern (/aws/<function name>/*) did not match that group, so this
+      # statement granted nothing useful.
+      resources = [
+        "arn:aws:logs:*:*:log-group:/aws/lambda/${local.lambda_function_name}:*"
+      ]
+
+      actions = [
+        "logs:CreateLogStream",
+        "logs:PutLogEvents",
+      ]
+    }
+  }
+}
diff --git a/lambdas/rich_pdf_ingestion/output.tf b/lambdas/rich_pdf_ingestion/output.tf
@@ -0,0 +1,7 @@
+# Re-exports the ARN of the Lambda created by the lambda_function_container_image module.
+output "lambda_function_arn" {
+  value = module.lambda_function_container_image.lambda_function_arn
+}
+
+# Re-exports the name of the Lambda created by the lambda_function_container_image module.
+output "lambda_function_name" {
+  value = module.lambda_function_container_image.lambda_function_name
+}
diff --git a/lambdas/rich_pdf_ingestion/src/index.py b/lambdas/rich_pdf_ingestion/src/index.py
@@ -0,0 +1,71 @@
+import tabula
+import os
+import boto3
+from pypdf import PdfReader
+import json
+from botocore.exceptions import NoCredentialsError, BotoCoreError, ClientError
+
+
+# Prefix matched against each record's eventName to select object-creation events.
+OBJECT_CREATED = "ObjectCreated"
+
+
+def generate_text_form_pdf(pdf_file_path):
+    text = ""
+
+    reader = PdfReader(pdf_file_path)
+    for page in reader.pages:
+        text += page.extract_text()
+
+    dataFrames = tabula.read_pdf(pdf_file_path, pages="all",lattice=True)
+    for df in dataFrames:
+        text += df.to_html()
+
+    return text
+
+def get_bucket_and_key(record):
+    bucket = record["s3"]["bucket"]["name"]
+    key = record["s3"]["object"]["key"]
+    return bucket, key
+
+def fetch_file(bucket, key):
+    s3 = boto3.client("s3")
+    local_filename = f"/tmp/{key.split('/')[-1]}"
+
+    try:
+        s3.download_file(bucket, key, local_filename)
+    except NoCredentialsError as e:
+        print(e)
+        raise e
+    except BotoCoreError as e:
+        print(e)
+        raise e
+    except ClientError as e:
+        print(e)
+        raise e
+    return local_filename
+
+
+def lambda_handler(event, context):
+    print(event)
+    records = json.loads(event["Records"][0]["body"])["Records"]
+    for record in records:
+        eventName = record["eventName"]
+        print(f"eventName: {eventName}")
+        try:
+            bucket, key = get_bucket_and_key(record)
+            print(f"source_bucket: {bucket}, source_key: {key}")
+
+            if eventName.startswith(OBJECT_CREATED):
+                local_filename = fetch_file(bucket, key)
+
+                # collection_name = bucket + "-"
+                # collection_name += os.path.dirname(key).replace("/", "-")
+
+                if os.path.splitext(key)[1][1:] == "pdf":
+                    print("Extracting text from pdf")
+                    document_text = generate_text_form_pdf(local_filename)
+                    print(f"Extracted: {document_text}")
+
+        except Exception as e:
+            print(e)
+            raise e
diff --git a/lambdas/rich_pdf_ingestion/src/requirements.txt b/lambdas/rich_pdf_ingestion/src/requirements.txt
@@ -0,0 +1,4 @@
+pypdf
+tabula-py
+pandas
+boto3
diff --git a/lambdas/rich_pdf_ingestion/variables.tf b/lambdas/rich_pdf_ingestion/variables.tf
@@ -0,0 +1,9 @@
+# Bucket handed to the Lambda module as its s3_bucket argument.
+variable "lambda_storage_bucket" {
+  type     = string
+  nullable = false
+}
+
+# AWS region interpolated into the ARNs built in this module's locals.
+variable "aws_region" {
+  type     = string
+  nullable = false
+}
diff --git a/terraform/modules.tf b/terraform/modules.tf
@@ -224,3 +224,9 @@ module "email_receipt_confirmation" {
   lambda_storage_bucket = aws_s3_bucket.lambda_storage.id
   aws_region            = var.aws_region
 }
+
+# Instantiates the Lambda module at ../lambdas/rich_pdf_ingestion with the shared
+# Lambda storage bucket and the configured region.
+module "rich_pdf_ingestion" {
+  source                = "../lambdas/rich_pdf_ingestion"
+  lambda_storage_bucket = aws_s3_bucket.lambda_storage.id
+  aws_region            = var.aws_region
+}