Merge pull request #27 from FloRul/optimization/diminuer-contexte
Optimization/diminuer contexte
FloRul authored Feb 16, 2024
2 parents e245246 + 5a7ee3d commit 584b092
Showing 8 changed files with 294 additions and 37 deletions.
6 changes: 3 additions & 3 deletions lambdas/inference/lambda.tf
@@ -29,10 +29,10 @@ module "lambda_function_container_image" {
   ENABLE_RETRIEVAL = 1
   MEMORY_LAMBDA_NAME = var.memory_lambda_name
   DYNAMO_TABLE = var.dynamo_history_table_name
-  TOP_K = 50
-  TEMPERATURE = 0.5
+  TOP_K = 10
+  TEMPERATURE = 0.1
   TOP_P = 0.99
-  RELEVANCE_THRESHOLD = 0.65
+  RELEVANCE_THRESHOLD = 0.67
   MODEL_ID = "anthropic.claude-instant-v1"
   EMBEDDING_COLLECTION_NAME = var.embedding_collection_name
   SYSTEM_PROMPT = "Answer in four to five sentences maximum.Answer in french."
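
In plain terms, this change pulls fewer retrieved chunks into the prompt (TOP_K 50 → 10), raises the relevance bar slightly (0.65 → 0.67), and makes generation less random (TEMPERATURE 0.5 → 0.1). As a hedged illustration only — not the actual lambdas/inference handler — the kind of filtering these settings drive looks roughly like this, with the environment variable names mirroring the block above:

import os

# Mirrors the environment block above; the defaults are only fallbacks for local runs.
TOP_K = int(os.environ.get("TOP_K", 10))
TEMPERATURE = float(os.environ.get("TEMPERATURE", 0.1))
RELEVANCE_THRESHOLD = float(os.environ.get("RELEVANCE_THRESHOLD", 0.67))


def select_context(scored_chunks):
    # scored_chunks: list of (text, similarity_score) pairs from the vector store.
    # Keep only chunks that clear the threshold, best first, capped at TOP_K,
    # which is what shrinks the prompt context after this change.
    relevant = [pair for pair in scored_chunks if pair[1] >= RELEVANCE_THRESHOLD]
    relevant.sort(key=lambda pair: pair[1], reverse=True)
    return [text for text, _ in relevant[:TOP_K]]


# Example: only the chunk scoring above 0.67 survives.
print(select_context([("chunk A", 0.91), ("chunk B", 0.40)]))  # -> ['chunk A']
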
7 changes: 4 additions & 3 deletions lambdas/ingestion/lambda.tf
@@ -4,13 +4,13 @@ data "aws_ecr_image" "lambda_image" {
 }

 module "lambda_function_container_image" {
-  timeout = 120
+  timeout = 900
   source = "terraform-aws-modules/lambda/aws"
   function_name = var.lambda_function_name
   create_package = false
   image_uri = data.aws_ecr_image.lambda_image.image_uri
   package_type = "Image"
-  memory_size = 256
+  memory_size = 2048
   vpc_subnet_ids = var.lambda_vpc_subnet_ids
   vpc_security_group_ids = var.lambda_vpc_security_group_ids
   role_name = "${var.lambda_function_name}-role"
@@ -23,7 +23,8 @@ module "lambda_function_container_image" {
     PGVECTOR_DATABASE = var.pg_vector_database
     PGVECTOR_USER = var.pg_vector_user
     PGVECTOR_PASSWORD_SECRET_NAME = var.pg_vector_password_secret_name
-    USE_TEXTRACT = 0
+    CHUNK_SIZE = 256
+    CHUNK_OVERLAP = 20
   }
   policy_statements = {
     log_group = {
39 changes: 11 additions & 28 deletions lambdas/ingestion/src/index.py
@@ -31,6 +31,8 @@ def get_secret():
 PGVECTOR_PORT = int(os.environ.get("PGVECTOR_PORT", 5432))
 PGVECTOR_DATABASE = os.environ.get("PGVECTOR_DATABASE", "postgres")
 PGVECTOR_USER = os.environ.get("PGVECTOR_USER", "postgres")
+CHUNK_SIZE = int(os.environ.get("CHUNK_SIZE", 256))
+CHUNK_OVERLAP = int(os.environ.get("CHUNK_OVERLAP", 20))
 PGVECTOR_PASSWORD = get_secret()


@@ -98,34 +100,15 @@ def get_vector_store(collection_name="main_collection"):
 def extract_pdf_content(file_path, file_name):
     print(f"Extracting content from {file_name}")

-    use_textract = int(os.getenv("USE_TEXTRACT", "0"))
-
-    if use_textract == 1:
-        textract_client = boto3.client("textract")
-        with open(file_path, "rb") as file:
-            response = textract_client.detect_document_text(
-                Document={"Bytes": file.read()}
-            )
-        text = " ".join(
-            [item["Text"] for item in response["Blocks"] if item["BlockType"] == "LINE"]
-        )
-        created_at = datetime.datetime.now().isoformat()
-        doc = Document(
-            text=text, metadata={"source": file_name, "created_at": created_at, "textract": True}
-        )
-        return [doc]
-    else:
-        loader = PyPDFLoader(file_path)
-        docs = loader.load_and_split(
-            text_splitter=RecursiveCharacterTextSplitter(
-                chunk_size=1000, chunk_overlap=50
-            )
-        )
-        created_at = datetime.datetime.now().isoformat()
-        for doc in docs:
-            doc.metadata["source"] = file_name
-            doc.metadata["created_at"] = created_at
-        return docs
+    loader = PyPDFLoader(file_path)
+    docs = loader.load_and_split(
+        text_splitter=RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
+    )
+    created_at = datetime.datetime.now().isoformat()
+    for doc in docs:
+        doc.metadata["source"] = file_name
+        doc.metadata["created_at"] = created_at
+    return docs


 OBJECT_CREATED = "ObjectCreated"
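
The splitter now reads its limits from CHUNK_SIZE and CHUNK_OVERLAP (256 and 20 by default) instead of the hard-coded 1000/50, so ingested documents are cut into much smaller overlapping pieces. A minimal standalone sketch of the effect, assuming the same LangChain splitter import the module already relies on:

from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=20)

sample = "word " * 400  # roughly 2000 characters of toy text
chunks = splitter.split_text(sample)

# Each chunk is at most 256 characters and consecutive chunks overlap by up to
# 20 characters; the previous 1000/50 settings produced far larger chunks and
# therefore a larger retrieval context downstream.
print(len(chunks), max(len(chunk) for chunk in chunks))
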
4 changes: 1 addition & 3 deletions lambdas/ingestion/src/requirements.txt
@@ -3,6 +3,4 @@ langchain-community
 langchain
 psycopg2-binary
 pgvector
-boto3>=1.28.85
-pypdf
-tabula-py
+boto3
142 changes: 142 additions & 0 deletions lambdas/textract_cracking/lambda.tf
@@ -0,0 +1,142 @@
data "aws_ecr_image" "lambda_image" {
repository_name = var.lambda_repository_name
most_recent = true
}

resource "aws_sns_topic" "ingestion_job_sns_topic" {
name = "${var.lambda_function_name}-topic"
}

module "lambda_function_container_image" {
timeout = 900
source = "terraform-aws-modules/lambda/aws"
function_name = var.lambda_function_name
create_package = false
image_uri = data.aws_ecr_image.lambda_image.image_uri
package_type = "Image"
memory_size = 2048
vpc_subnet_ids = var.lambda_vpc_subnet_ids
vpc_security_group_ids = var.lambda_vpc_security_group_ids
role_name = "${var.lambda_function_name}-role"
attach_policy_statements = true

environment_variables = {
PGVECTOR_DRIVER = "psycopg2"
PGVECTOR_HOST = var.pg_vector_host
PGVECTOR_PORT = var.pg_vector_port
PGVECTOR_DATABASE = var.pg_vector_database
PGVECTOR_USER = var.pg_vector_user
PGVECTOR_PASSWORD_SECRET_NAME = var.pg_vector_password_secret_name
USE_TEXTRACT = 0
}
policy_statements = {
log_group = {
effect = "Allow"
actions = [
"logs:CreateLogGroup"
]
resources = [
"arn:aws:logs:*:*:*"
]
}
log_write = {
effect = "Allow"

resources = [
"arn:aws:logs:*:*:log-group:/aws/${var.lambda_function_name}/*:*"
]

actions = [
"logs:CreateLogStream",
"logs:PutLogEvents",
]
}
bedrock_usage = {
effect = "Allow"

resources = [
"*"
]

actions = [
"bedrock:*"
]
}
rds_connect_readwrite = {
effect = "Allow"

resources = [
"arn:aws:rds:${var.aws_region}:446872271111:db:${var.pg_vector_database}"
]

actions = [
"rds-db:connect",
"rds-db:execute-statement",
"rds-db:rollback-transaction",
"rds-db:commit-transaction",
"rds-db:beginTransaction"
]
}
access_network_interface = {
effect = "Allow"

resources = [
"*"
]

actions = [
"ec2:CreateNetworkInterface",
"ec2:DescribeNetworkInterfaces",
"ec2:DeleteNetworkInterface"
]
}
secretsmanager = {
effect = "Allow"

resources = [
var.secret_arn
]

actions = [
"secretsmanager:GetSecretValue"
]
}
sqs = {
effect = "Allow"

resources = [
aws_sqs_queue.queue.arn
]

actions = [
"sqs:ReceiveMessage",
"sqs:DeleteMessage",
"sqs:GetQueueAttributes",
"sqs:ChangeMessageVisibility"
]
}
s3 = {
effect = "Allow"

resources = [
aws_s3_bucket.ingestion_source_storage.arn,
"${aws_s3_bucket.ingestion_source_storage.arn}/*"
]

actions = [
"s3:*"
]
}
textract = {
effect = "Allow"

resources = [
"*"
]

actions = [
"textract:*"
]
}
}
}
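
The policy block above grants the function SQS consumption and S3 read access, which matches an S3-notification-through-SQS trigger: each Lambda invocation carries SQS records whose body is itself a JSON-encoded S3 event. A sketch of that nesting (bucket and key names are placeholders), mirroring the parsing done in the handler in the next file:

import json

# Hypothetical payload as the Lambda would receive it (names are placeholders).
s3_notification = {
    "Records": [
        {
            "eventName": "ObjectCreated:Put",
            "s3": {
                "bucket": {"name": "ingestion-source-storage"},
                "object": {"key": "docs/report.pdf"},
            },
        }
    ]
}

sqs_event = {"Records": [{"body": json.dumps(s3_notification)}]}

# This mirrors the parsing in lambdas/textract_cracking/src/index.py:
inner = json.loads(sqs_event["Records"][0]["body"])["Records"]
print(inner[0]["s3"]["object"]["key"])  # -> docs/report.pdf
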
51 changes: 51 additions & 0 deletions lambdas/textract_cracking/src/index.py
@@ -0,0 +1,51 @@
import json
from textractor import Textractor
from textractor.data.constants import TextractFeatures
from textractor.data.text_linearization_config import TextLinearizationConfig

OBJECT_CREATED = "ObjectCreated"
OBJECT_REMOVED = "ObjectRemoved"


def start_textract_job(bucket, key):
    extractor = Textractor()
    job = extractor.start_document_analysis(
        file_source=f"s3://{bucket}/{key}",
        features=[TextractFeatures.LAYOUT],
        save_image=False,
    )
    config = TextLinearizationConfig(
        hide_figure_layout=True,
        header_prefix="<header>",
        header_suffix="</header>",
        title_prefix="<title>",
        title_suffix="</title>",
        section_header_prefix="<section_header>",
        section_header_suffix="</section_header>",
        add_prefixes_and_suffixes_as_words=True,
        add_prefixes_and_suffixes_in_text=True,
    )
    linearized_text = job.document.get_text(config=config)
    return linearized_text


def get_bucket_and_key(record):
    bucket = record["s3"]["bucket"]["name"]
    key = record["s3"]["object"]["key"]
    return bucket, key


def lambda_handler(event, context):
    print(event)
    records = json.loads(event["Records"][0]["body"])["Records"]
    for record in records:
        eventName = record["eventName"]
        print(f"eventName: {eventName}")
        try:
            bucket, key = get_bucket_and_key(record)
            print(f"source_bucket: {bucket}, source_key: {key}")

        except Exception as e:
            print(e)
            raise e
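
As committed, lambda_handler only logs the bucket and key; start_textract_job is not yet called from it. A minimal sketch of how the helper above could be wired in — the URL-decoding of the key and the downstream hand-off are assumptions, not part of this commit:

# Sketch only; not part of this commit.
import urllib.parse

def handle_record(record):
    bucket, key = get_bucket_and_key(record)
    # S3 event keys arrive URL-encoded (e.g. spaces as '+'), so decode before use.
    key = urllib.parse.unquote_plus(key)
    if record["eventName"].startswith(OBJECT_CREATED):
        text = start_textract_job(bucket, key)
        # Hand the linearized text to the ingestion pipeline here (chunking/embedding).
        return text
    return None
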
6 changes: 6 additions & 0 deletions lambdas/textract_cracking/src/requirements.txt
@@ -0,0 +1,6 @@
unstructured
psycopg2-binary
pgvector
boto3
amazon-textract-textractor
amazon-textract-textractor[pdf]
76 changes: 76 additions & 0 deletions lambdas/textract_cracking/variables.tf
@@ -0,0 +1,76 @@
variable "storage_bucket_name" {
type = string
nullable = false
}

variable "textract_output_bucket_name" {
type = string
nullable = false
}

variable "lambda_function_name" {
nullable = false
type = string
}

variable "lambda_vpc_security_group_ids" {
type = list(string)
nullable = false
}

variable "lambda_vpc_subnet_ids" {
type = list(string)
nullable = false
}

variable "pg_vector_host" {
type = string
nullable = false
}

variable "pg_vector_port" {
type = number
nullable = false
}

variable "pg_vector_database" {
type = string
nullable = false
}

variable "pg_vector_user" {
type = string
nullable = false
}

variable "pg_vector_driver" {
type = string
nullable = false
default = "psycopg2"
}

variable "pg_vector_password_secret_name" {
type = string
nullable = false
}

variable "secret_arn" {
type = string
nullable = false
}

variable "lambda_repository_name" {
nullable = false
}

variable "aws_region" {
type = string
default = "us-east-1"
}

variable "queue_name" {
description = "The name of the SQS queue that dispatches the files to the Lambda function"
type = string
nullable = false
}
