Skip to content

Commit

Permalink
Add tests for providers
Browse files Browse the repository at this point in the history
  • Loading branch information
akonarski-ds committed Sep 19, 2024
1 parent f060b59 commit dc51638
Show file tree
Hide file tree
Showing 5 changed files with 138 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ class DocumentType(str, Enum):
TSV = "tsv"
XML = "xml"

UNKNOWN = "unknown"


class DocumentMeta(BaseModel):
"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ class UnstructuredProvider(BaseProvider):

SUPPORTED_DOCUMENT_TYPES = {
DocumentType.TXT,
DocumentType.MD,
DocumentType.PDF,
DocumentType.DOCX,
DocumentType.DOC,
Expand Down
5 changes: 5 additions & 0 deletions packages/ragbits-document-search/tests/helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
import os


def env_vars_not_set(env_vars: list[str]) -> bool:
    """
    Tell whether at least one of the given environment variables is missing.

    Intended as a ``pytest.mark.skipif`` condition for tests that need *all*
    of the listed variables: the test must be skipped as soon as any single
    one of them is absent, otherwise it would run with an incomplete
    configuration and fail.

    Args:
        env_vars: Names of the environment variables the caller requires.

    Returns:
        True if any of the variables is not present in the environment,
        False if every variable is present (or if no variables are required).
    """
    # `any` (not `all`): a partially configured environment is still unusable.
    return any(os.environ.get(env_var) is None for env_var in env_vars)
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
from pathlib import Path

import pytest

from ragbits.document_search.documents.document import DocumentMeta, DocumentType
from ragbits.document_search.ingestion.document_processor import DocumentProcessor
from ragbits.document_search.ingestion.providers.dummy import DummyProvider
from ragbits.document_search.ingestion.providers.unstructured import (
UNSTRUCTURED_API_KEY_ENV,
UNSTRUCTURED_API_URL_ENV,
UnstructuredProvider,
)

from ..helpers import env_vars_not_set


async def test_document_processor_processes_text_document_with_dummy_provider():
    """A TXT document handled by ``DummyProvider`` yields a single element with the literal text."""
    processor = DocumentProcessor.from_config({DocumentType.TXT: DummyProvider()})
    text_document = DocumentMeta.create_text_document_from_literal("Name of Peppa's brother is George")

    processed = await processor.process(text_document)

    txt_provider = processor._providers[DocumentType.TXT]
    assert isinstance(txt_provider, DummyProvider)
    assert len(processed) == 1
    first_element = processed[0]
    assert first_element.content == "Name of Peppa's brother is George"


@pytest.mark.skipif(
    env_vars_not_set([UNSTRUCTURED_API_URL_ENV, UNSTRUCTURED_API_KEY_ENV]),
    reason="Unstructured API environment variables not set",
)
async def test_document_processor_processes_text_document_with_unstructured_provider():
    """
    With the default configuration a TXT document is handled by ``UnstructuredProvider``
    and comes back as a single element carrying the original text.

    Skipped when the Unstructured API environment variables are not available.
    """
    text = "Name of Peppa's brother is George."
    document_processor = DocumentProcessor.from_config()
    document_meta = DocumentMeta.create_text_document_from_literal(text)

    elements = await document_processor.process(document_meta)

    assert isinstance(document_processor._providers[DocumentType.TXT], UnstructuredProvider)
    assert len(elements) == 1
    # Compare against the exact input, trailing period included; the previous
    # expectation dropped the period and so could not match the submitted text.
    assert elements[0].content == text


@pytest.mark.skipif(
    env_vars_not_set([UNSTRUCTURED_API_URL_ENV, UNSTRUCTURED_API_KEY_ENV]),
    reason="Unstructured API environment variables not set",
)
async def test_document_processor_processes_md_document_with_unstructured_provider():
    """
    The default processor parses the ``README.md`` found five directory levels
    above this file; the first extracted element is expected to read "Ragbits".
    """
    root_dir = Path(__file__).parent.parent.parent.parent.parent
    readme_path = root_dir / "README.md"
    processor = DocumentProcessor.from_config()
    readme_meta = DocumentMeta.from_local_path(readme_path)

    parsed = await processor.process(readme_meta)

    assert len(parsed) > 0
    leading_element = parsed[0]
    assert leading_element.content == "Ragbits"
75 changes: 75 additions & 0 deletions packages/ragbits-document-search/tests/unit/test_providers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import pytest
from dotenv import load_dotenv

from ragbits.document_search.documents.document import DocumentMeta, DocumentType
from ragbits.document_search.ingestion.providers.base import DocumentTypeNotSupportedError
from ragbits.document_search.ingestion.providers.unstructured import (
DEFAULT_PARTITION_KWARGS,
UNSTRUCTURED_API_KEY_ENV,
UNSTRUCTURED_API_URL_ENV,
UnstructuredProvider,
)

from ..helpers import env_vars_not_set

# Load variables from a .env file into the process environment. This has to
# happen at import time, before the ``skipif`` decorators further down call
# ``env_vars_not_set`` to decide whether the API-backed tests can run.
load_dotenv()


@pytest.mark.parametrize("document_type", UnstructuredProvider.SUPPORTED_DOCUMENT_TYPES)
def test_unsupported_provider_validates_supported_document_types_passes(document_type: DocumentType):
    """Validation must not raise for any type listed in ``SUPPORTED_DOCUMENT_TYPES``."""
    provider = UnstructuredProvider()
    provider.validate_document_type(document_type)


def test_unsupported_provider_validates_supported_document_types_fails():
    """Validating ``DocumentType.UNKNOWN`` raises ``DocumentTypeNotSupportedError`` with a descriptive message."""
    provider = UnstructuredProvider()

    with pytest.raises(DocumentTypeNotSupportedError) as exc_info:
        provider.validate_document_type(DocumentType.UNKNOWN)

    error_text = str(exc_info.value)
    assert "Document type unknown is not supported by the UnstructuredProvider" in error_text


async def test_unstructured_provider_raises_value_error_when_api_key_not_set(monkeypatch: pytest.MonkeyPatch):
    """
    Processing a document without the API key in the environment raises ``ValueError``
    naming the missing variable.

    Args:
        monkeypatch: pytest fixture used to isolate the test from the real environment.
    """
    # The key may already be present (exported in the shell or loaded by
    # ``load_dotenv()`` at import time); remove it so the test is deterministic.
    # The change is undone automatically when the test finishes.
    monkeypatch.delenv(UNSTRUCTURED_API_KEY_ENV, raising=False)

    with pytest.raises(ValueError) as err:
        await UnstructuredProvider().process(
            DocumentMeta.create_text_document_from_literal("Name of Peppa's brother is George.")
        )

    assert f"{UNSTRUCTURED_API_KEY_ENV} environment variable is not set" in str(err.value)


async def test_unstructured_provider_raises_value_error_when_api_url_not_set(monkeypatch: pytest.MonkeyPatch):
    """
    With an API key present but no API URL in the environment, processing a document
    raises ``ValueError`` naming the missing URL variable.

    Args:
        monkeypatch: pytest fixture used to isolate the test from the real environment.
    """
    monkeypatch.setenv(UNSTRUCTURED_API_KEY_ENV, "dummy_key")
    # The URL may already be present (exported in the shell or loaded by
    # ``load_dotenv()`` at import time); remove it so the expected error is raised.
    monkeypatch.delenv(UNSTRUCTURED_API_URL_ENV, raising=False)

    with pytest.raises(ValueError) as err:
        await UnstructuredProvider().process(
            DocumentMeta.create_text_document_from_literal("Name of Peppa's brother is George.")
        )

    assert f"{UNSTRUCTURED_API_URL_ENV} environment variable is not set" in str(err.value)


@pytest.mark.skipif(
    env_vars_not_set([UNSTRUCTURED_API_URL_ENV, UNSTRUCTURED_API_KEY_ENV]),
    reason="Unstructured API environment variables not set",
)
async def test_unstructured_provider_document_with_default_partition_kwargs():
    """
    A provider built without arguments exposes ``DEFAULT_PARTITION_KWARGS`` and
    returns the literal text as a single element.
    """
    provider = UnstructuredProvider()
    text_document = DocumentMeta.create_text_document_from_literal("Name of Peppa's brother is George.")

    parsed = await provider.process(text_document)

    assert provider.partition_kwargs == DEFAULT_PARTITION_KWARGS
    assert len(parsed) == 1
    only_element = parsed[0]
    assert only_element.content == "Name of Peppa's brother is George."


@pytest.mark.skipif(
    env_vars_not_set([UNSTRUCTURED_API_URL_ENV, UNSTRUCTURED_API_KEY_ENV]),
    reason="Unstructured API environment variables not set",
)
async def test_unstructured_provider_document_with_custom_partition_kwargs():
    """
    A provider built with explicit ``partition_kwargs`` keeps them as given and
    still returns the literal text as a single element.
    """
    custom_kwargs = {"languages": ["pl"], "strategy": "fast"}
    provider = UnstructuredProvider(partition_kwargs=custom_kwargs)
    text_document = DocumentMeta.create_text_document_from_literal("Name of Peppa's brother is George.")

    parsed = await provider.process(text_document)

    assert provider.partition_kwargs == custom_kwargs
    assert len(parsed) == 1
    only_element = parsed[0]
    assert only_element.content == "Name of Peppa's brother is George."

0 comments on commit dc51638

Please sign in to comment.