Remove base class check

deepsense-ai · Oct 22, 2024 · cf3817a
1 parent d3c628f
commit cf3817a
Show file tree

Hide file tree

Showing 2 changed files with 25 additions and 21 deletions.
diff --git a/...s-document-search/src/ragbits/document_search/ingestion/providers/unstructured/default.py b/...s-document-search/src/ragbits/document_search/ingestion/providers/unstructured/default.py
@@ -4,7 +4,6 @@
 
 from unstructured.chunking.basic import chunk_elements
 from unstructured.documents.elements import Element as UnstructuredElement
-from unstructured.documents.elements import ElementType
 from unstructured.partition.auto import partition
 from unstructured.staging.base import elements_from_dicts
 from unstructured_client import UnstructuredClient
@@ -143,23 +142,11 @@ async def process(self, document_meta: DocumentMeta) -> list[Element]:
         return await self._chunk_and_convert(elements, document_meta, document.local_path)
 
     async def _chunk_and_convert(
-        self, elements: list[UnstructuredElement], document_meta: DocumentMeta, document_path: Path
+        # pylint: disable=unused-argument
+        self,
+        elements: list[UnstructuredElement],
+        document_meta: DocumentMeta,
+        document_path: Path,
     ) -> list[Element]:
-        if self.__class__ == UnstructuredDefaultProvider:
-            chunked_elements = chunk_elements(elements, **self.chunking_kwargs)
-            return [to_text_element(element, document_meta) for element in chunked_elements]
-        image_elements = [e for e in elements if e.category == ElementType.IMAGE]
-        other_elements = [e for e in elements if e.category != ElementType.IMAGE]
-        chunked_other_elements = chunk_elements(other_elements, **self.chunking_kwargs)
-
-        text_elements: list[Element] = [to_text_element(element, document_meta) for element in chunked_other_elements]
-        if self.ignore_images:
-            return text_elements
-        return text_elements + [
-            await self._to_image_element(element, document_meta, document_path) for element in image_elements
-        ]
-
-    async def _to_image_element(
-        self, element: UnstructuredElement, document_meta: DocumentMeta, document_path: Path
-    ) -> Element:
-        raise NotImplementedError("UnstructuredDefaultProvider doesn't support image conversion")
+        chunked_elements = chunk_elements(elements, **self.chunking_kwargs)
+        return [to_text_element(element, document_meta) for element in chunked_elements]
diff --git a/...ts-document-search/src/ragbits/document_search/ingestion/providers/unstructured/images.py b/...ts-document-search/src/ragbits/document_search/ingestion/providers/unstructured/images.py
@@ -2,16 +2,19 @@
 from typing import Optional
 
 from PIL import Image
+from unstructured.chunking.basic import chunk_elements
 from unstructured.documents.elements import Element as UnstructuredElement
+from unstructured.documents.elements import ElementType
 
 from ragbits.core.llms.litellm import LiteLLMOptions
 from ragbits.document_search.documents.document import DocumentMeta, DocumentType
-from ragbits.document_search.documents.element import ImageElement
+from ragbits.document_search.documents.element import Element, ImageElement
 from ragbits.document_search.ingestion.providers.unstructured.default import UnstructuredDefaultProvider
 from ragbits.document_search.ingestion.providers.unstructured.utils import (
     ImageDescriber,
     crop_and_convert_to_bytes,
     extract_image_coordinates,
+    to_text_element,
 )
 
 DEFAULT_LLM_IMAGE_SUMMARIZATION_MODEL = "gpt-4o-mini"
@@ -56,6 +59,20 @@ def __init__(
             llm_model_name or DEFAULT_LLM_IMAGE_SUMMARIZATION_MODEL, llm_options or DEFAULT_LLM_OPTIONS
         )
 
+    async def _chunk_and_convert(
+        self, elements: list[UnstructuredElement], document_meta: DocumentMeta, document_path: Path
+    ) -> list[Element]:
+        image_elements = [e for e in elements if e.category == ElementType.IMAGE]
+        other_elements = [e for e in elements if e.category != ElementType.IMAGE]
+        chunked_other_elements = chunk_elements(other_elements, **self.chunking_kwargs)
+
+        text_elements: list[Element] = [to_text_element(element, document_meta) for element in chunked_other_elements]
+        if self.ignore_images:
+            return text_elements
+        return text_elements + [
+            await self._to_image_element(element, document_meta, document_path) for element in image_elements
+        ]
+
     async def _to_image_element(
         self, element: UnstructuredElement, document_meta: DocumentMeta, document_path: Path
     ) -> ImageElement: