Merge pull request #27 from FloRul/optimization/diminuer-contexte
Optimization/diminuer contexte
FloRul authored Feb 16, 2024
2 parents e245246 + 5a7ee3d commit 584b092
Showing 8 changed files with 294 additions and 37 deletions.
6 changes: 3 additions & 3 deletions lambdas/inference/lambda.tf
@@ -29,10 +29,10 @@ module "lambda_function_container_image" {
   ENABLE_RETRIEVAL = 1
   MEMORY_LAMBDA_NAME = var.memory_lambda_name
   DYNAMO_TABLE = var.dynamo_history_table_name
-  TOP_K = 50
-  TEMPERATURE = 0.5
+  TOP_K = 10
+  TEMPERATURE = 0.1
   TOP_P = 0.99
-  RELEVANCE_THRESHOLD = 0.65
+  RELEVANCE_THRESHOLD = 0.67
   MODEL_ID = "anthropic.claude-instant-v1"
   EMBEDDING_COLLECTION_NAME = var.embedding_collection_name
   SYSTEM_PROMPT = "Answer in four to five sentences maximum.Answer in french."
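
In plain terms, this change pulls fewer retrieved chunks into the prompt (TOP_K 50 → 10), raises the relevance bar slightly (0.65 → 0.67), and makes generation less random (TEMPERATURE 0.5 → 0.1). As a hedged illustration only — not the actual lambdas/inference handler — the kind of filtering these settings drive looks roughly like this, with the environment variable names mirroring the block above:

import os

# Mirrors the environment block above; the defaults are only fallbacks for local runs.
TOP_K = int(os.environ.get("TOP_K", 10))
TEMPERATURE = float(os.environ.get("TEMPERATURE", 0.1))
RELEVANCE_THRESHOLD = float(os.environ.get("RELEVANCE_THRESHOLD", 0.67))


def select_context(scored_chunks):
    # scored_chunks: list of (text, similarity_score) pairs from the vector store.
    # Keep only chunks that clear the threshold, best first, capped at TOP_K,
    # which is what shrinks the prompt context after this change.
    relevant = [pair for pair in scored_chunks if pair[1] >= RELEVANCE_THRESHOLD]
    relevant.sort(key=lambda pair: pair[1], reverse=True)
    return [text for text, _ in relevant[:TOP_K]]


# Example: only the chunk scoring above 0.67 survives.
print(select_context([("chunk A", 0.91), ("chunk B", 0.40)]))  # -> ['chunk A']
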
7 changes: 4 additions & 3 deletions lambdas/ingestion/lambda.tf
@@ -4,13 +4,13 @@ data "aws_ecr_image" "lambda_image" {
 }

 module "lambda_function_container_image" {
-  timeout = 120
+  timeout = 900
   source = "terraform-aws-modules/lambda/aws"
   function_name = var.lambda_function_name
   create_package = false
   image_uri = data.aws_ecr_image.lambda_image.image_uri
   package_type = "Image"
-  memory_size = 256
+  memory_size = 2048
   vpc_subnet_ids = var.lambda_vpc_subnet_ids
   vpc_security_group_ids = var.lambda_vpc_security_group_ids
   role_name = "${var.lambda_function_name}-role"
@@ -23,7 +23,8 @@ module "lambda_function_container_image" {
     PGVECTOR_DATABASE = var.pg_vector_database
     PGVECTOR_USER = var.pg_vector_user
     PGVECTOR_PASSWORD_SECRET_NAME = var.pg_vector_password_secret_name
-    USE_TEXTRACT = 0
+    CHUNK_SIZE = 256
+    CHUNK_OVERLAP = 20
   }
   policy_statements = {
     log_group = {
39 changes: 11 additions & 28 deletions lambdas/ingestion/src/index.py
@@ -31,6 +31,8 @@ def get_secret():
 PGVECTOR_PORT = int(os.environ.get("PGVECTOR_PORT", 5432))
 PGVECTOR_DATABASE = os.environ.get("PGVECTOR_DATABASE", "postgres")
 PGVECTOR_USER = os.environ.get("PGVECTOR_USER", "postgres")
+CHUNK_SIZE = int(os.environ.get("CHUNK_SIZE", 256))
+CHUNK_OVERLAP = int(os.environ.get("CHUNK_OVERLAP", 20))
 PGVECTOR_PASSWORD = get_secret()


@@ -98,34 +100,15 @@ def get_vector_store(collection_name="main_collection"):
 def extract_pdf_content(file_path, file_name):
     print(f"Extracting content from {file_name}")

-    use_textract = int(os.getenv("USE_TEXTRACT", "0"))
-
-    if use_textract == 1:
-        textract_client = boto3.client("textract")
-        with open(file_path, "rb") as file:
-            response = textract_client.detect_document_text(
-                Document={"Bytes": file.read()}
-            )
-        text = " ".join(
-            [item["Text"] for item in response["Blocks"] if item["BlockType"] == "LINE"]
-        )
-        created_at = datetime.datetime.now().isoformat()
-        doc = Document(
-            text=text, metadata={"source": file_name, "created_at": created_at, "textract": True}
-        )
-        return [doc]
-    else:
-        loader = PyPDFLoader(file_path)
-        docs = loader.load_and_split(
-            text_splitter=RecursiveCharacterTextSplitter(
-                chunk_size=1000, chunk_overlap=50
-            )
-        )
-        created_at = datetime.datetime.now().isoformat()
-        for doc in docs:
-            doc.metadata["source"] = file_name
-            doc.metadata["created_at"] = created_at
-        return docs
+    loader = PyPDFLoader(file_path)
+    docs = loader.load_and_split(
+        text_splitter=RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
+    )
+    created_at = datetime.datetime.now().isoformat()
+    for doc in docs:
+        doc.metadata["source"] = file_name
+        doc.metadata["created_at"] = created_at
+    return docs


 OBJECT_CREATED = "ObjectCreated"
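
The splitter now reads its limits from CHUNK_SIZE and CHUNK_OVERLAP (256 and 20 by default) instead of the hard-coded 1000/50, so ingested documents are cut into much smaller overlapping pieces. A minimal standalone sketch of the effect, assuming the same LangChain splitter import the module already relies on:

from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=20)

sample = "word " * 400  # roughly 2000 characters of toy text
chunks = splitter.split_text(sample)

# Each chunk is at most 256 characters and consecutive chunks overlap by up to
# 20 characters; the previous 1000/50 settings produced far larger chunks and
# therefore a larger retrieval context downstream.
print(len(chunks), max(len(chunk) for chunk in chunks))
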
4 changes: 1 addition & 3 deletions lambdas/ingestion/src/requirements.txt
@@ -3,6 +3,4 @@ langchain-community
 langchain
 psycopg2-binary
 pgvector
-boto3>=1.28.85
-pypdf
-tabula-py
+boto3
142 changes: 142 additions & 0 deletions lambdas/textract_cracking/lambda.tf
@@ -0,0 +1,142 @@
data "aws_ecr_image" "lambda_image" {
repository_name = var.lambda_repository_name
most_recent = true
}

resource "aws_sns_topic" "ingestion_job_sns_topic" {
name = "${var.lambda_function_name}-topic"
}

module "lambda_function_container_image" {
timeout = 900
source = "terraform-aws-modules/lambda/aws"
function_name = var.lambda_function_name
create_package = false
image_uri = data.aws_ecr_image.lambda_image.image_uri
package_type = "Image"
memory_size = 2048
vpc_subnet_ids = var.lambda_vpc_subnet_ids
vpc_security_group_ids = var.lambda_vpc_security_group_ids
role_name = "${var.lambda_function_name}-role"
attach_policy_statements = true

environment_variables = {
PGVECTOR_DRIVER = "psycopg2"
PGVECTOR_HOST = var.pg_vector_host
PGVECTOR_PORT = var.pg_vector_port
PGVECTOR_DATABASE = var.pg_vector_database
PGVECTOR_USER = var.pg_vector_user
PGVECTOR_PASSWORD_SECRET_NAME = var.pg_vector_password_secret_name
USE_TEXTRACT = 0
}
policy_statements = {
log_group = {
effect = "Allow"
actions = [
"logs:CreateLogGroup"
]
resources = [
"arn:aws:logs:*:*:*"
]
}
log_write = {
effect = "Allow"

resources = [
"arn:aws:logs:*:*:log-group:/aws/${var.lambda_function_name}/*:*"
]

actions = [
"logs:CreateLogStream",
"logs:PutLogEvents",
]
}
bedrock_usage = {
effect = "Allow"

resources = [
"*"
]

actions = [
"bedrock:*"
]
}
rds_connect_readwrite = {
effect = "Allow"

resources = [
"arn:aws:rds:${var.aws_region}:446872271111:db:${var.pg_vector_database}"
]

actions = [
"rds-db:connect",
"rds-db:execute-statement",
"rds-db:rollback-transaction",
"rds-db:commit-transaction",
"rds-db:beginTransaction"
]
}
access_network_interface = {
effect = "Allow"

resources = [
"*"
]

actions = [
"ec2:CreateNetworkInterface",
"ec2:DescribeNetworkInterfaces",
"ec2:DeleteNetworkInterface"
]
}
secretsmanager = {
effect = "Allow"

resources = [
var.secret_arn
]

actions = [
"secretsmanager:GetSecretValue"
]
}
sqs = {
effect = "Allow"

resources = [
aws_sqs_queue.queue.arn
]

actions = [
"sqs:ReceiveMessage",
"sqs:DeleteMessage",
"sqs:GetQueueAttributes",
"sqs:ChangeMessageVisibility"
]
}
s3 = {
effect = "Allow"

resources = [
aws_s3_bucket.ingestion_source_storage.arn,
"${aws_s3_bucket.ingestion_source_storage.arn}/*"
]

actions = [
"s3:*"
]
}
textract = {
effect = "Allow"

resources = [
"*"
]

actions = [
"textract:*"
]
}
}
}
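
The policy block above grants the function SQS consumption and S3 read access, which matches an S3-notification-through-SQS trigger: each Lambda invocation carries SQS records whose body is itself a JSON-encoded S3 event. A sketch of that nesting (bucket and key names are placeholders), mirroring the parsing done in the handler in the next file:

import json

# Hypothetical payload as the Lambda would receive it (names are placeholders).
s3_notification = {
    "Records": [
        {
            "eventName": "ObjectCreated:Put",
            "s3": {
                "bucket": {"name": "ingestion-source-storage"},
                "object": {"key": "docs/report.pdf"},
            },
        }
    ]
}

sqs_event = {"Records": [{"body": json.dumps(s3_notification)}]}

# This mirrors the parsing in lambdas/textract_cracking/src/index.py:
inner = json.loads(sqs_event["Records"][0]["body"])["Records"]
print(inner[0]["s3"]["object"]["key"])  # -> docs/report.pdf
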
51 changes: 51 additions & 0 deletions lambdas/textract_cracking/src/index.py
@@ -0,0 +1,51 @@
import json
from textractor import Textractor
from textractor.data.constants import TextractFeatures
from textractor.data.text_linearization_config import TextLinearizationConfig

OBJECT_CREATED = "ObjectCreated"
OBJECT_REMOVED = "ObjectRemoved"


def start_textract_job(bucket, key):
    extractor = Textractor()
    job = extractor.start_document_analysis(
        file_source=f"s3://{bucket}/{key}",
        features=[TextractFeatures.LAYOUT],
        save_image=False,
    )
    config = TextLinearizationConfig(
        hide_figure_layout=True,
        header_prefix="<header>",
        header_suffix="</header>",
        title_prefix="<title>",
        title_suffix="</title>",
        section_header_prefix="<section_header>",
        section_header_suffix="</section_header>",
        add_prefixes_and_suffixes_as_words=True,
        add_prefixes_and_suffixes_in_text=True,
    )
    linearized_text = job.document.get_text(config=config)
    return linearized_text


def get_bucket_and_key(record):
    bucket = record["s3"]["bucket"]["name"]
    key = record["s3"]["object"]["key"]
    return bucket, key


def lambda_handler(event, context):
    print(event)
    records = json.loads(event["Records"][0]["body"])["Records"]
    for record in records:
        eventName = record["eventName"]
        print(f"eventName: {eventName}")
        try:
            bucket, key = get_bucket_and_key(record)
            print(f"source_bucket: {bucket}, source_key: {key}")

        except Exception as e:
            print(e)
            raise e
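
As committed, lambda_handler only logs the bucket and key; start_textract_job is not yet called from it. A minimal sketch of how the helper above could be wired in — the URL-decoding of the key and the downstream hand-off are assumptions, not part of this commit:

# Sketch only; not part of this commit.
import urllib.parse

def handle_record(record):
    bucket, key = get_bucket_and_key(record)
    # S3 event keys arrive URL-encoded (e.g. spaces as '+'), so decode before use.
    key = urllib.parse.unquote_plus(key)
    if record["eventName"].startswith(OBJECT_CREATED):
        text = start_textract_job(bucket, key)
        # Hand the linearized text to the ingestion pipeline here (chunking/embedding).
        return text
    return None
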
6 changes: 6 additions & 0 deletions lambdas/textract_cracking/src/requirements.txt
@@ -0,0 +1,6 @@
unstructured
psycopg2-binary
pgvector
boto3
amazon-textract-textractor
amazon-textract-textractor[pdf]
76 changes: 76 additions & 0 deletions lambdas/textract_cracking/variables.tf
@@ -0,0 +1,76 @@
variable "storage_bucket_name" {
type = string
nullable = false
}

variable "textract_output_bucket_name" {
type = string
nullable = false
}

variable "lambda_function_name" {
nullable = false
type = string
}

variable "lambda_vpc_security_group_ids" {
type = list(string)
nullable = false
}

variable "lambda_vpc_subnet_ids" {
type = list(string)
nullable = false
}

variable "pg_vector_host" {
type = string
nullable = false
}

variable "pg_vector_port" {
type = number
nullable = false
}

variable "pg_vector_database" {
type = string
nullable = false
}

variable "pg_vector_user" {
type = string
nullable = false
}

variable "pg_vector_driver" {
type = string
nullable = false
default = "psycopg2"
}

variable "pg_vector_password_secret_name" {
type = string
nullable = false
}

variable "secret_arn" {
type = string
nullable = false
}

variable "lambda_repository_name" {
nullable = false
}

variable "aws_region" {
type = string
default = "us-east-1"
}

variable "queue_name" {
description = "The name of the SQS queue that dispatches the files to the Lambda function"
type = string
nullable = false
}
