From b24516433a05c007a5d01b5b733d39095550f288 Mon Sep 17 00:00:00 2001
From: kdziedzic68
Date: Wed, 30 Oct 2024 12:44:53 +0100
Subject: [PATCH] feat(document-search) add location metadata to document (#122)

---
 .../document_search/documents/element.py      | 10 ++++++++
 .../ingestion/providers/unstructured/utils.py | 23 ++++++++++++++++++-
 2 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/packages/ragbits-document-search/src/ragbits/document_search/documents/element.py b/packages/ragbits-document-search/src/ragbits/document_search/documents/element.py
index 64c546dab..fcfad9239 100644
--- a/packages/ragbits-document-search/src/ragbits/document_search/documents/element.py
+++ b/packages/ragbits-document-search/src/ragbits/document_search/documents/element.py
@@ -7,6 +7,15 @@
 from ragbits.document_search.documents.document import DocumentMeta
 
 
+class ElementLocation(BaseModel):
+    """
+    An object representing position of chunk within document.
+    """
+
+    page_number: int | None = None
+    coordinates: dict | None = None
+
+
 class Element(BaseModel, ABC):
     """
     An object representing an element in a document.
@@ -14,6 +23,7 @@ class Element(BaseModel, ABC):
 
     element_type: str
     document_meta: DocumentMeta
+    location: ElementLocation | None = None
 
     _elements_registry: ClassVar[dict[str, type["Element"]]] = {}
 
diff --git a/packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured/utils.py b/packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured/utils.py
index f604e0084..cdd00b215 100644
--- a/packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured/utils.py
+++ b/packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured/utils.py
@@ -8,7 +8,7 @@
 from ragbits.core.llms.base import LLM
 from ragbits.core.prompt.base import BasePrompt
 from ragbits.document_search.documents.document import DocumentMeta
-from ragbits.document_search.documents.element import TextElement
+from ragbits.document_search.documents.element import ElementLocation, TextElement
 
 
 def to_text_element(element: UnstructuredElement, document_meta: DocumentMeta) -> TextElement:
@@ -22,9 +22,30 @@ def to_text_element(element: UnstructuredElement, document_meta: DocumentMeta) -
     Returns:
         text element
     """
+    location = to_element_location(element)
     return TextElement(
         document_meta=document_meta,
         content=element.text,
+        location=location,
+    )
+
+
+def to_element_location(element: UnstructuredElement) -> ElementLocation:
+    """
+    Converts unstructured element to element location.
+
+    Args:
+        element: element from unstructured
+
+    Returns:
+        element location
+    """
+    metadata = element.metadata.to_dict()
+    page_number = metadata.get("page_number")
+    coordinates = metadata.get("coordinates")
+    return ElementLocation(
+        page_number=page_number,
+        coordinates=coordinates,
     )
 
 