From cf3817ac5d4674e1d2033904b4b8d7636199249d Mon Sep 17 00:00:00 2001 From: Konrad Czarnota Date: Tue, 22 Oct 2024 07:19:46 +0200 Subject: [PATCH] Remove base class check --- .../providers/unstructured/default.py | 27 +++++-------------- .../providers/unstructured/images.py | 19 ++++++++++++- 2 files changed, 25 insertions(+), 21 deletions(-) diff --git a/packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured/default.py b/packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured/default.py index 3151a1e8..d6c92681 100644 --- a/packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured/default.py +++ b/packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured/default.py @@ -4,7 +4,6 @@ from unstructured.chunking.basic import chunk_elements from unstructured.documents.elements import Element as UnstructuredElement -from unstructured.documents.elements import ElementType from unstructured.partition.auto import partition from unstructured.staging.base import elements_from_dicts from unstructured_client import UnstructuredClient @@ -143,23 +142,11 @@ async def process(self, document_meta: DocumentMeta) -> list[Element]: return await self._chunk_and_convert(elements, document_meta, document.local_path) async def _chunk_and_convert( - self, elements: list[UnstructuredElement], document_meta: DocumentMeta, document_path: Path + # pylint: disable=unused-argument + self, + elements: list[UnstructuredElement], + document_meta: DocumentMeta, + document_path: Path, ) -> list[Element]: - if self.__class__ == UnstructuredDefaultProvider: - chunked_elements = chunk_elements(elements, **self.chunking_kwargs) - return [to_text_element(element, document_meta) for element in chunked_elements] - image_elements = [e for e in elements if e.category == ElementType.IMAGE] - other_elements = [e for e in elements if e.category != ElementType.IMAGE] - chunked_other_elements = chunk_elements(other_elements, **self.chunking_kwargs) - - text_elements: list[Element] = [to_text_element(element, document_meta) for element in chunked_other_elements] - if self.ignore_images: - return text_elements - return text_elements + [ - await self._to_image_element(element, document_meta, document_path) for element in image_elements - ] - - async def _to_image_element( - self, element: UnstructuredElement, document_meta: DocumentMeta, document_path: Path - ) -> Element: - raise NotImplementedError("UnstructuredDefaultProvider doesn't support image conversion") + chunked_elements = chunk_elements(elements, **self.chunking_kwargs) + return [to_text_element(element, document_meta) for element in chunked_elements] diff --git a/packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured/images.py b/packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured/images.py index 681a8fb7..f137724f 100644 --- a/packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured/images.py +++ b/packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured/images.py @@ -2,16 +2,19 @@ from typing import Optional from PIL import Image +from unstructured.chunking.basic import chunk_elements from unstructured.documents.elements import Element as UnstructuredElement +from unstructured.documents.elements import ElementType from ragbits.core.llms.litellm import LiteLLMOptions from ragbits.document_search.documents.document import DocumentMeta, DocumentType -from ragbits.document_search.documents.element import ImageElement +from ragbits.document_search.documents.element import Element, ImageElement from ragbits.document_search.ingestion.providers.unstructured.default import UnstructuredDefaultProvider from ragbits.document_search.ingestion.providers.unstructured.utils import ( ImageDescriber, crop_and_convert_to_bytes, extract_image_coordinates, + to_text_element, ) DEFAULT_LLM_IMAGE_SUMMARIZATION_MODEL = "gpt-4o-mini" @@ -56,6 +59,20 @@ def __init__( llm_model_name or DEFAULT_LLM_IMAGE_SUMMARIZATION_MODEL, llm_options or DEFAULT_LLM_OPTIONS ) + async def _chunk_and_convert( + self, elements: list[UnstructuredElement], document_meta: DocumentMeta, document_path: Path + ) -> list[Element]: + image_elements = [e for e in elements if e.category == ElementType.IMAGE] + other_elements = [e for e in elements if e.category != ElementType.IMAGE] + chunked_other_elements = chunk_elements(other_elements, **self.chunking_kwargs) + + text_elements: list[Element] = [to_text_element(element, document_meta) for element in chunked_other_elements] + if self.ignore_images: + return text_elements + return text_elements + [ + await self._to_image_element(element, document_meta, document_path) for element in image_elements + ] + async def _to_image_element( self, element: UnstructuredElement, document_meta: DocumentMeta, document_path: Path ) -> ImageElement: