From 7c929a64984499cc0780062050fce333a62f0b3e Mon Sep 17 00:00:00 2001 From: kdziedzic Date: Mon, 21 Oct 2024 16:18:38 +0200 Subject: [PATCH] add location metadata to document --- .../document_search/documents/document.py | 23 +++++++++++++++++++ .../ingestion/providers/unstructured.py | 7 ++++-- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/packages/ragbits-document-search/src/ragbits/document_search/documents/document.py b/packages/ragbits-document-search/src/ragbits/document_search/documents/document.py index 581f33006..3b87d4fdb 100644 --- a/packages/ragbits-document-search/src/ragbits/document_search/documents/document.py +++ b/packages/ragbits-document-search/src/ragbits/document_search/documents/document.py @@ -32,6 +32,15 @@ class DocumentType(str, Enum): UNKNOWN = "unknown" +class DocumentLocation(BaseModel): + """ + An object representing position of chunk within document + """ + + page_number: int | None = None + coordinates: dict | None = None + + class DocumentMeta(BaseModel): """ An object representing a document metadata. @@ -39,6 +48,7 @@ class DocumentMeta(BaseModel): document_type: DocumentType source: LocalFileSource | GCSSource | HuggingFaceSource = Field(..., discriminator="source_type") + location: DocumentLocation | None = None @property def id(self) -> str: @@ -50,6 +60,19 @@ def id(self) -> str: """ return self.source.id + def add_location_metadata(self, provider_metadata: dict | None) -> None: + """ + Add metadata retrieved by provider to document metadata. + + Args: + provider_metadata: metadata retrieved by provider or null. + + """ + if provider_metadata: + page_number = provider_metadata.get("page_number", None) + coordinates = provider_metadata.get("coordinates", None) + self.location = DocumentLocation(page_number=page_number, coordinates=coordinates) + async def fetch(self) -> "Document": """ This method fetches the document from source (potentially remote) and creates an object to interface with it. 
diff --git a/packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured.py b/packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured.py index 116d38645..679fe18b1 100644 --- a/packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured.py +++ b/packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured.py @@ -1,4 +1,5 @@ import os +from copy import deepcopy from io import BytesIO from typing import Optional @@ -131,14 +132,16 @@ async def process(self, document_meta: DocumentMeta) -> list[Element]: metadata_filename=document.local_path.name, **self.partition_kwargs, ) - elements = chunk_elements(elements, **self.chunking_kwargs) return [_to_text_element(element, document_meta) for element in elements] def _to_text_element(element: UnstructuredElement, document_meta: DocumentMeta) -> TextElement: + _document_meta = deepcopy(document_meta) + if element.metadata: + _document_meta.add_location_metadata(provider_metadata=element.metadata.to_dict()) return TextElement( - document_meta=document_meta, + document_meta=_document_meta, content=element.text, )