Skip to content

Commit

Permalink
feat(document-search): add chunking in unstructured provider
Browse files Browse the repository at this point in the history
  • Loading branch information
mhordynski committed Sep 26, 2024
1 parent 73992af commit 8886d5b
Showing 1 changed file with 6 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from io import BytesIO
from typing import Optional

from unstructured.chunking.basic import chunk_elements
from unstructured.documents.elements import Element as UnstructuredElement
from unstructured.partition.api import partition_via_api

Expand All @@ -17,6 +18,8 @@
"split_pdf_concurrency_level": 15,
}

# Fallback keyword arguments for chunking, used when UnstructuredProvider is
# constructed without chunking_kwargs (or with an empty dict). It is empty, so
# chunk_elements receives no extra keyword arguments by default.
DEFAULT_CHUNKING_KWARGS: dict = {}

# NOTE(review): presumably the names of the environment variables holding the
# Unstructured API key and API URL; their use is outside this view, so confirm.
UNSTRUCTURED_API_KEY_ENV = "UNSTRUCTURED_API_KEY"
UNSTRUCTURED_API_URL_ENV = "UNSTRUCTURED_API_URL"

Expand Down Expand Up @@ -47,14 +50,15 @@ class UnstructuredProvider(BaseProvider):
DocumentType.XML,
}

def __init__(self, partition_kwargs: Optional[dict] = None):
def __init__(self, partition_kwargs: Optional[dict] = None, chunking_kwargs: Optional[dict] = None):
    """Initialize the UnstructuredProvider.

    Args:
        partition_kwargs: The additional arguments for the partitioning. Refer to the Unstructured API documentation
            for the available options: https://docs.unstructured.io/api-reference/api-services/api-parameters
            Falls back to DEFAULT_PARTITION_KWARGS when omitted or empty.
        chunking_kwargs: The additional arguments forwarded to `chunk_elements` when the partitioned elements
            are chunked. Falls back to DEFAULT_CHUNKING_KWARGS when omitted or empty.
    """
    # Store shallow copies so that mutating an instance's kwargs can never leak into the
    # module-level defaults (shared by every provider) or into the caller's own dict.
    self.partition_kwargs = dict(partition_kwargs or DEFAULT_PARTITION_KWARGS)
    self.chunking_kwargs = dict(chunking_kwargs or DEFAULT_CHUNKING_KWARGS)

async def process(self, document_meta: DocumentMeta) -> list[Element]:
"""Process the document using the Unstructured API.
Expand Down Expand Up @@ -86,6 +90,7 @@ async def process(self, document_meta: DocumentMeta) -> list[Element]:
api_url=api_url,
**self.partition_kwargs,
)
elements = chunk_elements(elements, **self.chunking_kwargs)
return [_to_text_element(element, document_meta) for element in elements]


Expand Down

0 comments on commit 8886d5b

Please sign in to comment.