diff --git a/lambdas/bedrock_invoker/lambda.tf b/lambdas/bedrock_invoker/lambda.tf
new file mode 100644
index 0000000..3fc2f1f
--- /dev/null
+++ b/lambdas/bedrock_invoker/lambda.tf
@@ -0,0 +1,79 @@
+locals {
+  lambda_function_name = "levio-esta-bedrock-invoker"
+  timeout              = 30
+  runtime              = "python3.11"
+  powertools_layer_arn = "arn:aws:lambda:${var.aws_region}:017000801446:layer:AWSLambdaPowertoolsPythonV2:67"
+}
+
+data "aws_caller_identity" "current" {}
+
+
+module "lambda_function_container_image" {
+  source                   = "terraform-aws-modules/lambda/aws"
+  function_name            = local.lambda_function_name
+  handler                  = "index.lambda_handler"
+  publish                  = true
+  runtime                  = local.runtime
+  timeout                  = local.timeout
+  layers                   = [local.powertools_layer_arn]
+  source_path              = "${path.module}/src"
+  s3_bucket                = var.lambda_storage_bucket
+  memory_size              = 256
+  role_name                = "${local.lambda_function_name}-role"
+  attach_policy_statements = true
+
+  policy_statements = {
+    log_group = {
+      effect = "Allow"
+      actions = [
+        "logs:CreateLogGroup"
+      ]
+      resources = [
+        "arn:aws:logs:*:*:*"
+      ]
+    }
+
+    bedrock_invoke = {
+      effect = "Allow"
+      actions = [
+        "bedrock:InvokeModel"
+      ]
+      # Foundation-model ARNs carry no account id; only provisioned
+      # throughput models are scoped to the calling account.
+      resources = [
+        "arn:aws:bedrock:*::foundation-model/*",
+        "arn:aws:bedrock:*:${data.aws_caller_identity.current.account_id}:provisioned-model/*"
+      ]
+    }
+
+    s3 = {
+      effect = "Allow"
+      actions = [
+        "s3:Get*",
+        "s3:List*",
+        "s3:Describe*",
+        "s3:PutObject",
+        "s3-object-lambda:Get*",
+        "s3-object-lambda:List*",
+        "s3-object-lambda:WriteGetObjectResponse"
+      ]
+      resources = var.allowed_s3_resources
+    }
+
+    log_write = {
+      effect = "Allow"
+
+      # Lambda log groups are named /aws/lambda/<function name>; the log
+      # streams inside the group are matched by the trailing ":*".
+      resources = [
+        "arn:aws:logs:*:*:log-group:/aws/lambda/${local.lambda_function_name}:*"
+      ]
+
+      actions = [
+        "logs:CreateLogStream",
+        "logs:PutLogEvents",
+      ]
+    }
+
+  }
+}
diff --git a/lambdas/bedrock_invoker/output.tf b/lambdas/bedrock_invoker/output.tf
new file mode 100644
index 0000000..cd2e3af
--- /dev/null
+++ b/lambdas/bedrock_invoker/output.tf
@@ -0,0 +1,7 @@
+output "lambda_function_arn" {
+  value = module.lambda_function_container_image.lambda_function_arn
+}
+
+output "lambda_function_name" {
+  value = module.lambda_function_container_image.lambda_function_name
+}
diff --git a/lambdas/bedrock_invoker/src/index.py b/lambdas/bedrock_invoker/src/index.py
new file mode 100644
index 0000000..3f35876
--- /dev/null
+++ b/lambdas/bedrock_invoker/src/index.py
@@ -0,0 +1,70 @@
+import json
+
+import boto3
+from botocore.exceptions import BotoCoreError, ClientError
+
+s3 = boto3.client('s3')
+# InvokeModel is a data-plane call served by the "bedrock-runtime" client;
+# the plain "bedrock" client only offers model-management operations.
+bedrock = boto3.client('bedrock-runtime')
+
+
+def lambda_handler(event, context):
+    """Send the text of an S3 object to a Bedrock model.
+
+    Expected event keys:
+        s3_arn: ARN of the object to read (arn:aws:s3:::bucket/path/to/key)
+        bedrock_params: dict with 'model_name' (Bedrock model id) and 'master'
+        prompt: prompt forwarded to the model next to the object text
+
+    Returns a dict with 'statusCode' and 'body'; on success the decoded
+    model output is added under 'bedrockResponse'.
+    """
+    s3_arn = event['s3_arn']
+    bedrock_params = event['bedrock_params']
+    prompt = event['prompt']
+
+    # Split on the first "/" only: object keys usually contain slashes.
+    bucket, _, key = s3_arn.partition(':::')[2].partition('/')
+    if not bucket or not key:
+        return {
+            'statusCode': 400,
+            'body': f'Invalid S3 object ARN: {s3_arn}'
+        }
+
+    # Download the file from S3 and decode its text
+    try:
+        s3_object = s3.get_object(Bucket=bucket, Key=key)
+        extracted_text = s3_object['Body'].read().decode('utf-8')
+    except (BotoCoreError, ClientError, UnicodeDecodeError) as e:
+        return {
+            'statusCode': 400,
+            'body': str(e)
+        }
+
+    # Invoke the Bedrock model with the extracted text and the provided parameters
+    try:
+        response = bedrock.invoke_model(
+            modelId=bedrock_params['model_name'],
+            contentType='application/json',
+            accept='application/json',
+            body=json.dumps({
+                'master': bedrock_params['master'],
+                'prompt': prompt,
+                'message': extracted_text
+            })
+        )
+        # The response body is a stream; read and decode it so the Lambda
+        # result stays JSON-serializable.
+        bedrock_response = json.loads(response['body'].read())
+    except (BotoCoreError, ClientError, ValueError) as e:
+        return {
+            'statusCode': 400,
+            'body': str(e)
+        }
+
+    return {
+        'statusCode': 200,
+        'body': 'Successfully processed the S3 ARN',
+        'bedrockResponse': bedrock_response
+    }
diff --git a/lambdas/bedrock_invoker/variables.tf b/lambdas/bedrock_invoker/variables.tf
new file mode 100644
index 0000000..45ce8a5
--- /dev/null
+++ b/lambdas/bedrock_invoker/variables.tf
@@ -0,0 +1,15 @@
+variable "lambda_storage_bucket" {
+  type     = string
+  nullable = false
+}
+
+variable "aws_region" {
+  type     = string
+  nullable = false
+}
+
+variable "allowed_s3_resources" {
+  type        = list(string)
+  nullable    = false
+  description = "values for the s3 resources that the lambda function can access"
+}
\ No newline at end of file
diff --git a/lambdas/rich_pdf_ingestion/src/index.py b/lambdas/rich_pdf_ingestion/src/index.py
index be51160..d5ff31b 100644
--- a/lambdas/rich_pdf_ingestion/src/index.py
+++ b/lambdas/rich_pdf_ingestion/src/index.py
@@ -10,6 +10,55 @@ s3 = boto3.client("s3")



+def lambda_handler(event, context):
+    """
+    Extract text/tables from a PDF and store them in an S3 object
+    """
+    print(event)
+    attachment_s3_arn = event['path']
+
+    try:
+        attachment_s3_info = parse_s3_arn(attachment_s3_arn)
+        print("Attachment s3 arn parsed info: ", attachment_s3_info)
+        bucket = attachment_s3_info["bucket"]
+        folder = attachment_s3_info['folder']
+        key = attachment_s3_info["key"]
+        filename_without_extension = attachment_s3_info['filename_without_extension']
+        extracted_files_s3_arns = []
+
+        if os.path.splitext(key)[1][1:] == "pdf":
+            local_filename = fetch_file(bucket, key)
+            print("Extracting text from pdf")
+            extracted_text = extract_text_from_pdf(local_filename)
+            extracted_text_local_file = store_extracted_text_in_local_file(
+                extracted_text)
+            print("Finished extracting text from pdf")
+            extracted_text_s3_key = "/".join(
+                [folder, filename_without_extension+"_extracted_pdf_content", str(uuid.uuid4())+".txt"])
+            print("Uploading file to ", extracted_text_s3_key)
+            upload_file(
+                file_to_upload=extracted_text_local_file,
+                bucket=bucket,
+                key=extracted_text_s3_key
+            )
+            extracted_files_s3_arns.append(
+                f"arn:aws:s3:::{bucket}/{extracted_text_s3_key}")
+
+        return {
+            'statusCode': 200,
+            'body': 'PDF text content extracted and saved',
+            'attachment_arns': extracted_files_s3_arns
+        }
+
+    except Exception as e:
+        print(e)
+        # Exception objects are not JSON-serializable; return the message text.
+        return {
+            'statusCode': 400,
+            'body': str(e)
+        }
+
+
 def extract_text_from_pdf(pdf_file_path):
     text = ""

@@ -67,48 +116,3 @@ def store_extracted_text_in_local_file(extracted_text):
         f.write(extracted_text)

     return local_file_path
-
-
-def lambda_handler(event, context):
-    print(event)
-    attachment_s3_arn = event['path']
-
-    try:
-        attachment_s3_info = parse_s3_arn(attachment_s3_arn)
-        print("Attachment s3 arn parsed info: ", attachment_s3_info)
-        bucket = attachment_s3_info["bucket"]
-        folder = attachment_s3_info['folder']
-        key = attachment_s3_info["key"]
-        filename_without_extension = attachment_s3_info['filename_without_extension']
-        extracted_files_s3_arns = []
-
-        if os.path.splitext(key)[1][1:] == "pdf":
-            local_filename = fetch_file(bucket, key)
-            print("Extracting text from pdf")
-            extracted_text = extract_text_from_pdf(local_filename)
-            extracted_text_local_file = store_extracted_text_in_local_file(
-                extracted_text)
-            print("Finished extracting text from pdf")
-            extracted_text_s3_key = "/".join(
-                [folder, filename_without_extension+"_extracted_pdf_content", str(uuid.uuid4())+".txt"])
-            print("Uploading file to ", extracted_text_s3_key)
-            upload_file(
-                file_to_upload=extracted_text_local_file,
-                bucket=bucket,
-                key=extracted_text_s3_key
-            )
-            extracted_files_s3_arns.append(
-                f"arn:aws:s3:::{bucket}/{extracted_text_s3_key}")
-
-        return {
-            'statusCode': 200,
-            'body': 'PDF text content extracted and saved',
-            'attachment_arns': extracted_files_s3_arns
-        }
-
-    except Exception as e:
-        print(e)
-        return {
-            'statusCode': 400,
-            'body': e
-        }
diff --git a/terraform/modules.tf b/terraform/modules.tf
index 5dd335c..d4a7c3b 100644
--- a/terraform/modules.tf
+++ b/terraform/modules.tf
@@ -267,3 +267,12 @@ module "email_attachment_saver" {

   allowed_s3_resources = [module.s3_bucket.s3_bucket_arn, "${module.s3_bucket.s3_bucket_arn}/*"]
 }
+
+module "bedrock_invoker" {
+  source                = "../lambdas/bedrock_invoker"
+  lambda_storage_bucket = aws_s3_bucket.lambda_storage.id
+  aws_region            = var.aws_region
+  allowed_s3_resources  = [module.s3_bucket.s3_bucket_arn, "${module.s3_bucket.s3_bucket_arn}/*"]
+}
+
+