-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
21 changed files
with
1,237 additions
and
107 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,3 @@ | ||
pkg_resources | ||
tiktoken | ||
tiktoken | ||
chardet |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
83 changes: 66 additions & 17 deletions
83
packages/ragbits-document-search/src/ragbits/document_search/ingestion/document_processor.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,35 +1,84 @@ | ||
""" | ||
TODO: This module is mocked. To be deleted and replaced with a real implementation. | ||
""" | ||
import copy | ||
from typing import Optional | ||
|
||
from typing import List | ||
from ragbits.document_search.documents.document import DocumentMeta, DocumentType | ||
from ragbits.document_search.documents.element import Element | ||
from ragbits.document_search.ingestion.providers.base import BaseProvider | ||
from ragbits.document_search.ingestion.providers.unstructured import UnstructuredProvider | ||
|
||
from ragbits.document_search.documents.document import DocumentMeta, TextDocument | ||
from ragbits.document_search.documents.element import Element, TextElement | ||
# Maps each document type to the provider instance responsible for processing it.
ProvidersConfig = dict[DocumentType, BaseProvider]

# Default configuration: every listed document type gets its own UnstructuredProvider instance.
DEFAULT_PROVIDERS_CONFIG: ProvidersConfig = {
    document_type: UnstructuredProvider()
    for document_type in (
        DocumentType.TXT,
        DocumentType.MD,
        DocumentType.PDF,
        DocumentType.DOCX,
        DocumentType.DOC,
        DocumentType.PPTX,
        DocumentType.PPT,
        DocumentType.XLSX,
        DocumentType.XLS,
        DocumentType.CSV,
        DocumentType.HTML,
        DocumentType.EPUB,
        DocumentType.ORG,
        DocumentType.ODT,
        DocumentType.RST,
        DocumentType.RTF,
        DocumentType.TSV,
        DocumentType.XML,
    )
}
|
||
|
||
class DocumentProcessor:
    """
    Processes documents by delegating to the provider registered for the document's type.
    """

    def __init__(self, providers: dict[DocumentType, BaseProvider]):
        """
        Create a DocumentProcessor.

        Args:
            providers: The mapping from document types to the provider instances that process them.
        """
        self._providers = providers

    @classmethod
    def from_config(cls, providers_config: Optional[ProvidersConfig] = None) -> "DocumentProcessor":
        """
        Create a DocumentProcessor from a configuration. If the configuration is not provided, the default
        configuration will be used. If the configuration is provided, it will be merged with the default
        configuration, overriding the default values for the document types that are defined in the configuration.

        Example of the configuration:
            {
                DocumentType.TXT: YourCustomProviderClass(),
                DocumentType.PDF: UnstructuredProvider(),
            }

        Args:
            providers_config: The dictionary with the providers configuration, mapping the document types to the
                provider instances.

        Returns:
            The DocumentProcessor.
        """
        # Work on a copy so the module-level defaults are never mutated by the merge below.
        config = copy.deepcopy(DEFAULT_PROVIDERS_CONFIG)
        if providers_config is not None:
            config.update(providers_config)

        return cls(providers=config)

    async def process(self, document_meta: DocumentMeta) -> list[Element]:
        """
        Process the document.

        Args:
            document_meta: The document to process.

        Returns:
            The list of elements extracted from the document.

        Raises:
            ValueError: If the provider for the document type is not defined in the configuration.
        """
        provider = self._providers.get(document_meta.document_type)
        if provider is None:
            raise ValueError(
                f"Provider for {document_meta.document_type} is not defined in the configuration: {self._providers}"
            )

        # The provider is the only component that fetches and parses the document.
        return await provider.process(document_meta)
Empty file.
41 changes: 41 additions & 0 deletions
41
packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/base.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
from abc import ABC, abstractmethod | ||
|
||
from ragbits.document_search.documents.document import DocumentMeta, DocumentType | ||
from ragbits.document_search.documents.element import Element | ||
|
||
|
||
class DocumentTypeNotSupportedError(Exception):
    """Raised when the document type is not supported by the provider.

    Attributes:
        provider_name: The name of the provider that rejected the document type.
        document_type: The document type that is not supported.
    """

    def __init__(self, provider_name: str, document_type: DocumentType) -> None:
        # Keep the inputs on the exception so handlers do not have to parse the message.
        self.provider_name = provider_name
        self.document_type = document_type
        super().__init__(f"Document type {document_type} is not supported by the {provider_name}")
|
||
|
||
class BaseProvider(ABC):
    """Common interface implemented by every document processing provider."""

    # Document types accepted by the provider; declared here without a value.
    SUPPORTED_DOCUMENT_TYPES: set[DocumentType]

    @abstractmethod
    async def process(self, document_meta: DocumentMeta) -> list[Element]:
        """Extract elements from the document.

        Args:
            document_meta: The document to process.

        Returns:
            The list of elements extracted from the document.
        """

    def validate_document_type(self, document_type: DocumentType) -> None:
        """Ensure that the provider can handle the given document type.

        Args:
            document_type: The document type.

        Raises:
            DocumentTypeNotSupportedError: If the document type is not supported.
        """
        if document_type in self.SUPPORTED_DOCUMENT_TYPES:
            return

        provider_name = type(self).__name__
        raise DocumentTypeNotSupportedError(provider_name=provider_name, document_type=document_type)
29 changes: 29 additions & 0 deletions
29
packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/dummy.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
from ragbits.document_search.documents.document import DocumentMeta, DocumentType, TextDocument | ||
from ragbits.document_search.documents.element import Element, TextElement | ||
from ragbits.document_search.ingestion.providers.base import BaseProvider | ||
|
||
|
||
class DummyProvider(BaseProvider):
    """Mock provider that wraps the content of a text document in a single TextElement.

    Intended for testing purposes only.

    TODO: Remove this provider after the implementation of the real providers.
    """

    SUPPORTED_DOCUMENT_TYPES = {DocumentType.TXT}

    async def process(self, document_meta: DocumentMeta) -> list[Element]:
        """Process the text document.

        Args:
            document_meta: The document to process.

        Returns:
            List with a single TextElement containing the content of the document, or an empty list
            when the fetched document is not a TextDocument.
        """
        self.validate_document_type(document_meta.document_type)

        fetched = await document_meta.fetch()
        if not isinstance(fetched, TextDocument):
            return []

        text_element = TextElement(content=fetched.content, document_meta=document_meta)
        return [text_element]
Oops, something went wrong.