Skip to content

Commit

Permalink
Refactor PDF content extraction logic
Browse files Browse the repository at this point in the history
  • Loading branch information
FloRul committed Feb 15, 2024
1 parent c3ba509 commit c1955b0
Show file tree
Hide file tree
Showing 7 changed files with 65 additions and 31 deletions.
37 changes: 9 additions & 28 deletions lambdas/ingestion/src/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,34 +98,15 @@ def get_vector_store(collection_name="main_collection"):
def extract_pdf_content(file_path, file_name):
    """Load a PDF and split it into chunked documents with source metadata.

    Parameters
    ----------
    file_path : str
        Local filesystem path of the PDF to read.
    file_name : str
        Original object name; stored in each chunk's ``source`` metadata.

    Returns
    -------
    list
        Documents produced by ``PyPDFLoader.load_and_split`` (chunks of
        ~1000 characters with 50-character overlap), each tagged with
        ``source`` and a shared ``created_at`` ISO-8601 timestamp.
    """
    print(f"Extracting content from {file_name}")

    loader = PyPDFLoader(file_path)
    docs = loader.load_and_split(
        text_splitter=RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
    )
    # Timezone-aware UTC timestamp: naive now() would silently depend on the
    # host's local timezone, making ingestion times ambiguous across runs.
    created_at = datetime.datetime.now(datetime.timezone.utc).isoformat()
    for doc in docs:
        doc.metadata["source"] = file_name
        doc.metadata["created_at"] = created_at
    return docs


OBJECT_CREATED = "ObjectCreated"
Expand Down
4 changes: 1 addition & 3 deletions lambdas/ingestion/src/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,4 @@ langchain-community
langchain
psycopg2-binary
pgvector
boto3
amazon-textract-textractor
amazon-textract-textractor[pdf]
boto3
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@ data "aws_ecr_image" "lambda_image" {
most_recent = true
}

# SNS topic for ingestion-job notifications, named after the Lambda function
# it accompanies (e.g. "<function-name>-topic").
resource "aws_sns_topic" "ingestion_job_sns_topic" {
name = "${var.lambda_function_name}-topic"
}

module "lambda_function_container_image" {
timeout = 900
source = "terraform-aws-modules/lambda/aws"
Expand Down
51 changes: 51 additions & 0 deletions lambdas/textract_cracking/src/index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import json
from textractor import Textractor
from textractor.data.constants import TextractFeatures
from textractor.data.text_linearization_config import TextLinearizationConfig

# S3 event-name prefixes used to classify incoming bucket notifications.
OBJECT_CREATED = "ObjectCreated"
OBJECT_REMOVED = "ObjectRemoved"


def start_textract_job(bucket, key):
    """Run Textract layout analysis on an S3 object and return its linearized text.

    Starts a document-analysis job via Textractor for ``s3://bucket/key`` with
    the LAYOUT feature, then flattens the resulting document into one string,
    wrapping headers, titles and section headers in XML-like tags so the
    structure survives downstream processing.
    """
    s3_uri = f"s3://{bucket}/{key}"
    analysis = Textractor().start_document_analysis(
        file_source=s3_uri,
        features=[TextractFeatures.LAYOUT],
        save_image=False,
    )
    # Tag structural elements inline and hide figure layouts entirely.
    linearization = TextLinearizationConfig(
        hide_figure_layout=True,
        header_prefix="<header>",
        header_suffix="</header>",
        title_prefix="<title>",
        title_suffix="</title>",
        section_header_prefix="<section_header>",
        section_header_suffix="</section_header>",
        add_prefixes_and_suffixes_as_words=True,
        add_prefixes_and_suffixes_in_text=True,
    )
    return analysis.document.get_text(config=linearization)



def get_bucket_and_key(record):
    """Return the (bucket_name, object_key) pair from an S3 event record."""
    s3_info = record["s3"]
    return s3_info["bucket"]["name"], s3_info["object"]["key"]


def lambda_handler(event, context):
    """Handle SQS-delivered S3 notifications and resolve each object's location.

    Each SQS record's ``body`` is a JSON-encoded S3 notification which may
    itself contain several S3 records. All SQS records in the batch are
    processed (not just the first), and bodies without a ``"Records"`` key
    (e.g. ``s3:TestEvent`` pings) are skipped rather than raising KeyError.

    Parameters
    ----------
    event : dict
        SQS event payload with an outer ``"Records"`` list.
    context : object
        Lambda context (unused).

    Raises
    ------
    Exception
        Re-raised after logging so the message returns to the queue for retry.
    """
    print(event)
    for sqs_record in event["Records"]:
        body = json.loads(sqs_record["body"])
        for record in body.get("Records", []):
            event_name = record["eventName"]
            print(f"eventName: {event_name}")
            try:
                bucket, key = get_bucket_and_key(record)
                print(f"source_bucket: {bucket}, source_key: {key}")
            except Exception as e:
                print(e)
                # Bare raise preserves the original traceback.
                raise
File renamed without changes.
Empty file.

0 comments on commit c1955b0

Please sign in to comment.