Skip to content

Commit

Permalink
add location metadata to document
Browse files Browse the repository at this point in the history
  • Loading branch information
kdziedzic68 committed Oct 21, 2024
1 parent e77e234 commit 7c929a6
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,23 @@ class DocumentType(str, Enum):
UNKNOWN = "unknown"


class DocumentLocation(BaseModel):
    """
    An object representing the position of a chunk within a document.
    """

    # Page number of the chunk; stays None when no page number is supplied.
    page_number: int | None = None
    # Coordinates of the chunk; stays None when no coordinates are supplied.
    # NOTE(review): the dict schema is not defined here - presumably provider-specific; confirm.
    coordinates: dict | None = None


class DocumentMeta(BaseModel):
"""
An object representing a document metadata.
"""

document_type: DocumentType
source: LocalFileSource | GCSSource | HuggingFaceSource = Field(..., discriminator="source_type")
location: DocumentLocation | None = None

@property
def id(self) -> str:
Expand All @@ -50,6 +60,19 @@ def id(self) -> str:
"""
return self.source.id

def add_location_metadata(self, provider_metadata: dict | None) -> None:
"""
Add metadata retrived by provider to document metadata.
Args:
provider_metadata: metadata retrived by provider or null.
"""
if provider_metadata:
page_number = provider_metadata.get("page_number", None)
coordinates = provider_metadata.get("coordinates", None)
self.location = DocumentLocation(page_number=page_number, coordinates=coordinates)

async def fetch(self) -> "Document":
"""
This method fetches the document from source (potentially remote) and creates an object to interface with it.
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
from copy import deepcopy
from io import BytesIO
from typing import Optional

Expand Down Expand Up @@ -131,14 +132,16 @@ async def process(self, document_meta: DocumentMeta) -> list[Element]:
metadata_filename=document.local_path.name,
**self.partition_kwargs,
)

elements = chunk_elements(elements, **self.chunking_kwargs)
return [_to_text_element(element, document_meta) for element in elements]


def _to_text_element(element: UnstructuredElement, document_meta: DocumentMeta) -> TextElement:
    """
    Convert an unstructured element into a TextElement carrying its own document metadata.

    Args:
        element: The element whose text (and, when present, metadata) is used.
        document_meta: The metadata of the document the element comes from. It is
            never mutated; a deep copy is attached to the returned element.

    Returns:
        A TextElement with the element's text as content and a per-element copy of
        the document metadata, enriched with location metadata when the element has any.
    """
    # Copy first: location is per-element, so writing it onto the shared
    # document_meta would leak one element's location into all the others.
    element_meta = deepcopy(document_meta)
    if element.metadata:
        element_meta.add_location_metadata(provider_metadata=element.metadata.to_dict())

    # Pass document_meta exactly once; a repeated keyword argument is a SyntaxError.
    return TextElement(
        document_meta=element_meta,
        content=element.text,
    )

Expand Down

0 comments on commit 7c929a6

Please sign in to comment.