Skip to content

Commit

Permalink
Remove base class check
Browse files Browse the repository at this point in the history
  • Loading branch information
konrad-czarnota-ds committed Oct 22, 2024
1 parent d3c628f commit cf3817a
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 21 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

from unstructured.chunking.basic import chunk_elements
from unstructured.documents.elements import Element as UnstructuredElement
from unstructured.documents.elements import ElementType
from unstructured.partition.auto import partition
from unstructured.staging.base import elements_from_dicts
from unstructured_client import UnstructuredClient
Expand Down Expand Up @@ -143,23 +142,11 @@ async def process(self, document_meta: DocumentMeta) -> list[Element]:
return await self._chunk_and_convert(elements, document_meta, document.local_path)

async def _chunk_and_convert(
self, elements: list[UnstructuredElement], document_meta: DocumentMeta, document_path: Path
# pylint: disable=unused-argument
self,
elements: list[UnstructuredElement],
document_meta: DocumentMeta,
document_path: Path,
) -> list[Element]:
if self.__class__ == UnstructuredDefaultProvider:
chunked_elements = chunk_elements(elements, **self.chunking_kwargs)
return [to_text_element(element, document_meta) for element in chunked_elements]
image_elements = [e for e in elements if e.category == ElementType.IMAGE]
other_elements = [e for e in elements if e.category != ElementType.IMAGE]
chunked_other_elements = chunk_elements(other_elements, **self.chunking_kwargs)

text_elements: list[Element] = [to_text_element(element, document_meta) for element in chunked_other_elements]
if self.ignore_images:
return text_elements
return text_elements + [
await self._to_image_element(element, document_meta, document_path) for element in image_elements
]

async def _to_image_element(
self, element: UnstructuredElement, document_meta: DocumentMeta, document_path: Path
) -> Element:
raise NotImplementedError("UnstructuredDefaultProvider doesn't support image conversion")
chunked_elements = chunk_elements(elements, **self.chunking_kwargs)
return [to_text_element(element, document_meta) for element in chunked_elements]
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,19 @@
from typing import Optional

from PIL import Image
from unstructured.chunking.basic import chunk_elements
from unstructured.documents.elements import Element as UnstructuredElement
from unstructured.documents.elements import ElementType

from ragbits.core.llms.litellm import LiteLLMOptions
from ragbits.document_search.documents.document import DocumentMeta, DocumentType
from ragbits.document_search.documents.element import ImageElement
from ragbits.document_search.documents.element import Element, ImageElement
from ragbits.document_search.ingestion.providers.unstructured.default import UnstructuredDefaultProvider
from ragbits.document_search.ingestion.providers.unstructured.utils import (
ImageDescriber,
crop_and_convert_to_bytes,
extract_image_coordinates,
to_text_element,
)

DEFAULT_LLM_IMAGE_SUMMARIZATION_MODEL = "gpt-4o-mini"
Expand Down Expand Up @@ -56,6 +59,20 @@ def __init__(
llm_model_name or DEFAULT_LLM_IMAGE_SUMMARIZATION_MODEL, llm_options or DEFAULT_LLM_OPTIONS
)

async def _chunk_and_convert(
    self,
    elements: list[UnstructuredElement],
    document_meta: DocumentMeta,
    document_path: Path,
) -> list[Element]:
    """Chunk the non-image elements and convert everything to ragbits elements.

    Elements whose ``category`` equals ``ElementType.IMAGE`` are set aside;
    all remaining elements are passed to ``chunk_elements`` together with
    ``self.chunking_kwargs`` and each resulting chunk is converted with
    ``to_text_element``.

    Args:
        elements: Elements produced by unstructured for a single document.
        document_meta: Metadata of the document the elements come from.
        document_path: Path of the document, forwarded to
            ``self._to_image_element`` for every image element.

    Returns:
        The converted text elements, followed (unless ``self.ignore_images``
        is truthy) by the converted image elements in their original order.
    """
    images: list[UnstructuredElement] = []
    non_images: list[UnstructuredElement] = []
    for item in elements:
        bucket = images if item.category == ElementType.IMAGE else non_images
        bucket.append(item)

    converted: list[Element] = [
        to_text_element(chunk, document_meta)
        for chunk in chunk_elements(non_images, **self.chunking_kwargs)
    ]

    if not self.ignore_images:
        # Image conversions are awaited one after another, so the output keeps
        # the order in which the image elements appeared in ``elements``.
        for image in images:
            converted.append(await self._to_image_element(image, document_meta, document_path))
    return converted

async def _to_image_element(
self, element: UnstructuredElement, document_meta: DocumentMeta, document_path: Path
) -> ImageElement:
Expand Down

0 comments on commit cf3817a

Please sign in to comment.