diff --git a/lambdas/ingestion/src/index.py b/lambdas/ingestion/src/index.py
index 518f469..afb114f 100644
--- a/lambdas/ingestion/src/index.py
+++ b/lambdas/ingestion/src/index.py
@@ -5,7 +5,7 @@
 from langchain_community.embeddings import BedrockEmbeddings
 from langchain_community.vectorstores.pgvector import PGVector
 from botocore.exceptions import ClientError
-from botocore.exceptions import NoCredentialsError, BotoCoreError
+from botocore.exceptions import NoCredentialsError, BotoCoreError, ClientError
 from langchain_community.document_loaders import PyPDFLoader
 from langchain_core.documents import Document
 from langchain.text_splitter import RecursiveCharacterTextSplitter
diff --git a/lambdas/rich_pdf_ingestion/lambda.tf b/lambdas/rich_pdf_ingestion/lambda.tf
new file mode 100644
--- /dev/null
+++ b/lambdas/rich_pdf_ingestion/lambda.tf
@@ -0,0 +1,54 @@
+locals {
+  lambda_function_name = "rich_pdf_ingestion"
+  ses_arn              = "arn:aws:ses:${var.aws_region}:${data.aws_caller_identity.current.account_id}"
+  # NOTE(review): tabula-py runs tabula-java, so the function needs a Java
+  # runtime; confirm one is packaged and that 30 s / 256 MB are enough.
+  timeout              = 30
+  runtime              = "python3.11"
+  powertools_layer_arn = "arn:aws:lambda:${var.aws_region}:017000801446:layer:AWSLambdaPowertoolsPythonV2:67"
+}
+
+data "aws_caller_identity" "current" {}
+
+
+module "lambda_function_container_image" {
+  source                   = "terraform-aws-modules/lambda/aws"
+  function_name            = local.lambda_function_name
+  handler                  = "index.lambda_handler"
+  publish                  = true
+  runtime                  = local.runtime
+  timeout                  = local.timeout
+  layers                   = [local.powertools_layer_arn]
+  source_path              = "${path.module}/src"
+  s3_bucket                = var.lambda_storage_bucket
+  memory_size              = 256
+  role_name                = "${local.lambda_function_name}-role"
+  attach_policy_statements = true
+
+  policy_statements = {
+    log_group = {
+      effect = "Allow"
+      actions = [
+        "logs:CreateLogGroup"
+      ]
+      resources = [
+        "arn:aws:logs:*:*:*"
+      ]
+    }
+
+    log_write = {
+      effect = "Allow"
+
+      resources = [
+        # Lambda writes to the log group /aws/lambda/FUNCTION_NAME; the
+        # trailing ":*" covers the log streams inside that group.
+        "arn:aws:logs:*:*:log-group:/aws/lambda/${local.lambda_function_name}:*"
+      ]
+
+      actions = [
+        "logs:CreateLogStream",
+        "logs:PutLogEvents",
+      ]
+    }
+  }
+}
diff --git a/lambdas/rich_pdf_ingestion/output.tf b/lambdas/rich_pdf_ingestion/output.tf
new file mode 100644
index 0000000..cd2e3af
--- /dev/null
+++ b/lambdas/rich_pdf_ingestion/output.tf
@@ -0,0 +1,7 @@
+output "lambda_function_arn" {
+  value = module.lambda_function_container_image.lambda_function_arn
+}
+
+output "lambda_function_name" {
+  value = module.lambda_function_container_image.lambda_function_name
+}
diff --git a/lambdas/rich_pdf_ingestion/src/index.py b/lambdas/rich_pdf_ingestion/src/index.py
new file mode 100644
--- /dev/null
+++ b/lambdas/rich_pdf_ingestion/src/index.py
@@ -0,0 +1,98 @@
+import json
+import os
+from urllib.parse import unquote_plus
+
+import boto3
+import tabula
+from botocore.exceptions import NoCredentialsError, BotoCoreError, ClientError
+from pypdf import PdfReader
+
+
+OBJECT_CREATED = "ObjectCreated"
+
+
+def generate_text_form_pdf(pdf_file_path):
+    """Return the text of a PDF followed by its tables rendered as HTML.
+
+    Page text is extracted with pypdf; tables are read by tabula in lattice
+    mode across all pages and appended, as HTML, after the page text.
+    """
+    reader = PdfReader(pdf_file_path)
+    # Guard against a page yielding None instead of a string.
+    parts = [page.extract_text() or "" for page in reader.pages]
+
+    data_frames = tabula.read_pdf(pdf_file_path, pages="all", lattice=True)
+    parts.extend(df.to_html() for df in data_frames)
+
+    return "".join(parts)
+
+
+def get_bucket_and_key(record):
+    """Return the (bucket, key) pair named by one S3 notification record.
+
+    S3 URL-encodes object keys in event notifications (a space arrives as
+    "+"), so the key is decoded before it is used against the S3 API.
+    """
+    bucket = record["s3"]["bucket"]["name"]
+    key = unquote_plus(record["s3"]["object"]["key"])
+    return bucket, key
+
+
+def fetch_file(bucket, key):
+    """Download s3://bucket/key into /tmp and return the local file path.
+
+    botocore errors are printed and re-raised to the caller.
+    """
+    s3 = boto3.client("s3")
+    local_filename = f"/tmp/{key.split('/')[-1]}"
+
+    try:
+        s3.download_file(bucket, key, local_filename)
+    except (NoCredentialsError, BotoCoreError, ClientError) as e:
+        print(e)
+        raise
+    return local_filename
+
+
+def _process_record(record):
+    """Handle one S3 notification record; only created PDF objects are read."""
+    event_name = record["eventName"]
+    print(f"eventName: {event_name}")
+
+    bucket, key = get_bucket_and_key(record)
+    print(f"source_bucket: {bucket}, source_key: {key}")
+
+    if not event_name.startswith(OBJECT_CREATED):
+        return
+    # Check the extension before downloading so non-PDF objects are skipped
+    # without any S3 transfer; the comparison ignores case (".PDF").
+    if os.path.splitext(key)[1].lower() != ".pdf":
+        return
+
+    local_filename = fetch_file(bucket, key)
+    try:
+        print("Extracting text from pdf")
+        document_text = generate_text_form_pdf(local_filename)
+        print(f"Extracted: {document_text}")
+    finally:
+        # Remove the download so repeated invocations do not fill /tmp.
+        os.remove(local_filename)
+
+
+def lambda_handler(event, context):
+    """Process the S3 notifications carried by every message in the event.
+
+    Each entry of event["Records"] is expected to hold a JSON "body" whose
+    own "Records" list contains the S3 notification records. Any failure is
+    printed and re-raised.
+    """
+    print(event)
+    try:
+        for message in event["Records"]:
+            # S3 test notifications ("s3:TestEvent") have no "Records" key.
+            notification = json.loads(message["body"])
+            for record in notification.get("Records", []):
+                _process_record(record)
+    except Exception as e:
+        print(e)
+        raise
diff --git a/lambdas/rich_pdf_ingestion/src/requirements.txt b/lambdas/rich_pdf_ingestion/src/requirements.txt
new file mode 100644
index 0000000..ccb24ab
--- /dev/null
+++ b/lambdas/rich_pdf_ingestion/src/requirements.txt
@@ -0,0 +1,4 @@
+pypdf
+tabula-py
+pandas
+boto3
diff --git a/lambdas/rich_pdf_ingestion/variables.tf b/lambdas/rich_pdf_ingestion/variables.tf
new file mode 100644
index 0000000..0fadc92
--- /dev/null
+++ b/lambdas/rich_pdf_ingestion/variables.tf
@@ -0,0 +1,9 @@
+variable "lambda_storage_bucket" {
+  type     = string
+  nullable = false
+}
+
+variable "aws_region" {
+  type     = string
+  nullable = false
+}
\ No newline at end of file
diff --git a/terraform/modules.tf b/terraform/modules.tf
index badf52a..4b64d1a 100644
--- a/terraform/modules.tf
+++ b/terraform/modules.tf
@@ -224,3 +224,9 @@ module "email_receipt_confirmation" {
   lambda_storage_bucket = aws_s3_bucket.lambda_storage.id
   aws_region            = var.aws_region
 }
+
+module "rich_pdf_ingestion" {
+  source                = "../lambdas/rich_pdf_ingestion"
+  lambda_storage_bucket = aws_s3_bucket.lambda_storage.id
+  aws_region            = var.aws_region
+}