-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
f060b59
commit dc51638
Showing
5 changed files
with
138 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
import os | ||
|
||
|
||
def env_vars_not_set(env_vars: list[str]) -> bool:
    """
    Tell whether at least one of the given environment variables is missing.

    Meant to be used as a ``pytest.mark.skipif`` condition for tests that need
    every listed variable: such a test has to be skipped as soon as one of the
    variables is absent, not only when all of them are absent.

    Args:
        env_vars: Names of the environment variables to look up.

    Returns:
        True if any of the variables is absent from ``os.environ``; False when
        all of them are present (and for an empty list, since nothing is missing).
    """
    # `any`, not `all`: a partially configured environment is still unusable.
    return any(os.environ.get(env_var) is None for env_var in env_vars)
55 changes: 55 additions & 0 deletions
55
packages/ragbits-document-search/tests/unit/test_document_processor.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
from pathlib import Path | ||
|
||
import pytest | ||
|
||
from ragbits.document_search.documents.document import DocumentMeta, DocumentType | ||
from ragbits.document_search.ingestion.document_processor import DocumentProcessor | ||
from ragbits.document_search.ingestion.providers.dummy import DummyProvider | ||
from ragbits.document_search.ingestion.providers.unstructured import ( | ||
UNSTRUCTURED_API_KEY_ENV, | ||
UNSTRUCTURED_API_URL_ENV, | ||
UnstructuredProvider, | ||
) | ||
|
||
from ..helpers import env_vars_not_set | ||
|
||
|
||
async def test_document_processor_processes_text_document_with_dummy_provider():
    """A text literal processed with a DummyProvider configured for TXT yields one element with the same content."""
    text = "Name of Peppa's brother is George"
    processor = DocumentProcessor.from_config({DocumentType.TXT: DummyProvider()})

    processed = await processor.process(DocumentMeta.create_text_document_from_literal(text))

    txt_provider = processor._providers[DocumentType.TXT]
    assert isinstance(txt_provider, DummyProvider)
    assert len(processed) == 1
    assert processed[0].content == text
|
||
|
||
@pytest.mark.skipif(
    env_vars_not_set([UNSTRUCTURED_API_URL_ENV, UNSTRUCTURED_API_KEY_ENV]),
    reason="Unstructured API environment variables not set",
)
async def test_document_processor_processes_text_document_with_unstructured_provider():
    """A default-configured DocumentProcessor handles a TXT literal with UnstructuredProvider."""
    text = "Name of Peppa's brother is George."
    document_processor = DocumentProcessor.from_config()
    document_meta = DocumentMeta.create_text_document_from_literal(text)

    elements = await document_processor.process(document_meta)

    assert isinstance(document_processor._providers[DocumentType.TXT], UnstructuredProvider)
    assert len(elements) == 1
    # NOTE(review): the expectation now equals the input literal, trailing period
    # included, in line with the direct UnstructuredProvider tests for the same
    # text - confirm against the live API.
    assert elements[0].content == text
|
||
|
||
@pytest.mark.skipif(
    env_vars_not_set([UNSTRUCTURED_API_URL_ENV, UNSTRUCTURED_API_KEY_ENV]),
    reason="Unstructured API environment variables not set",
)
async def test_document_processor_processes_md_document_with_unstructured_provider():
    """A default-configured DocumentProcessor turns the README.md five directories up into elements."""
    processor = DocumentProcessor.from_config()

    # Walk five directory levels up from this file to reach the README's folder.
    readme_dir = Path(__file__)
    for _ in range(5):
        readme_dir = readme_dir.parent
    readme_meta = DocumentMeta.from_local_path(readme_dir / "README.md")

    processed = await processor.process(readme_meta)

    assert len(processed) > 0
    first_element = processed[0]
    assert first_element.content == "Ragbits"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
import pytest | ||
from dotenv import load_dotenv | ||
|
||
from ragbits.document_search.documents.document import DocumentMeta, DocumentType | ||
from ragbits.document_search.ingestion.providers.base import DocumentTypeNotSupportedError | ||
from ragbits.document_search.ingestion.providers.unstructured import ( | ||
DEFAULT_PARTITION_KWARGS, | ||
UNSTRUCTURED_API_KEY_ENV, | ||
UNSTRUCTURED_API_URL_ENV, | ||
UnstructuredProvider, | ||
) | ||
|
||
from ..helpers import env_vars_not_set | ||
|
||
load_dotenv() | ||
|
||
|
||
@pytest.mark.parametrize("document_type", UnstructuredProvider.SUPPORTED_DOCUMENT_TYPES)
def test_unsupported_provider_validates_supported_document_types_passes(document_type: DocumentType):
    """Validation completes without raising for each type in SUPPORTED_DOCUMENT_TYPES."""
    provider = UnstructuredProvider()
    provider.validate_document_type(document_type)
|
||
|
||
def test_unsupported_provider_validates_supported_document_types_fails():
    """Validating DocumentType.UNKNOWN raises DocumentTypeNotSupportedError with the expected message."""
    expected_fragment = "Document type unknown is not supported by the UnstructuredProvider"

    with pytest.raises(DocumentTypeNotSupportedError) as exc_info:
        UnstructuredProvider().validate_document_type(DocumentType.UNKNOWN)

    message = str(exc_info.value)
    assert expected_fragment in message
|
||
|
||
async def test_unstructured_provider_raises_value_error_when_api_key_not_set(monkeypatch: pytest.MonkeyPatch):
    """Processing a document without the API key variable raises a ValueError naming that variable."""
    # The key may be present in the real environment (shell or the module-level
    # load_dotenv() call); remove it for this test so the error path is
    # exercised deterministically. monkeypatch restores it afterwards.
    monkeypatch.delenv(UNSTRUCTURED_API_KEY_ENV, raising=False)

    with pytest.raises(ValueError) as err:
        await UnstructuredProvider().process(
            DocumentMeta.create_text_document_from_literal("Name of Peppa's brother is George.")
        )

    assert f"{UNSTRUCTURED_API_KEY_ENV} environment variable is not set" in str(err.value)
|
||
|
||
async def test_unstructured_provider_raises_value_error_when_api_url_not_set(monkeypatch: pytest.MonkeyPatch):
    """With a key set but no API URL variable, processing raises a ValueError naming the URL variable."""
    monkeypatch.setenv(UNSTRUCTURED_API_KEY_ENV, "dummy_key")
    # The URL may be present in the real environment (shell or the module-level
    # load_dotenv() call); remove it so the missing-URL path is actually hit.
    monkeypatch.delenv(UNSTRUCTURED_API_URL_ENV, raising=False)

    with pytest.raises(ValueError) as err:
        await UnstructuredProvider().process(
            DocumentMeta.create_text_document_from_literal("Name of Peppa's brother is George.")
        )

    assert f"{UNSTRUCTURED_API_URL_ENV} environment variable is not set" in str(err.value)
|
||
|
||
@pytest.mark.skipif(
    env_vars_not_set([UNSTRUCTURED_API_URL_ENV, UNSTRUCTURED_API_KEY_ENV]),
    reason="Unstructured API environment variables not set",
)
async def test_unstructured_provider_document_with_default_partition_kwargs():
    """A provider built without arguments uses DEFAULT_PARTITION_KWARGS and returns the text as one element."""
    text = "Name of Peppa's brother is George."
    meta = DocumentMeta.create_text_document_from_literal(text)
    provider = UnstructuredProvider()

    processed = await provider.process(meta)

    assert provider.partition_kwargs == DEFAULT_PARTITION_KWARGS
    assert len(processed) == 1
    assert processed[0].content == text
|
||
|
||
@pytest.mark.skipif(
    env_vars_not_set([UNSTRUCTURED_API_URL_ENV, UNSTRUCTURED_API_KEY_ENV]),
    reason="Unstructured API environment variables not set",
)
async def test_unstructured_provider_document_with_custom_partition_kwargs():
    """A provider built with explicit partition kwargs keeps them and returns the text as one element."""
    text = "Name of Peppa's brother is George."
    meta = DocumentMeta.create_text_document_from_literal(text)
    custom_kwargs = {"languages": ["pl"], "strategy": "fast"}
    provider = UnstructuredProvider(partition_kwargs=custom_kwargs)

    processed = await provider.process(meta)

    assert provider.partition_kwargs == custom_kwargs
    assert len(processed) == 1
    assert processed[0].content == text