From dc51638611ac0a87d68bc8b2ad37d3491a505dbd Mon Sep 17 00:00:00 2001
From: Alan Konarski
Date: Thu, 19 Sep 2024 13:55:18 +0200
Subject: [PATCH] Add tests for providers

---
 .../document_search/documents/document.py    |  2 +
 .../ingestion/providers/unstructured.py      |  1 +
 .../ragbits-document-search/tests/helpers.py | 16 ++++
 .../tests/unit/test_document_processor.py    | 58 +++++++++++++
 .../tests/unit/test_providers.py             | 86 +++++++++++++++++++
 5 files changed, 163 insertions(+)
 create mode 100644 packages/ragbits-document-search/tests/helpers.py
 create mode 100644 packages/ragbits-document-search/tests/unit/test_document_processor.py

diff --git a/packages/ragbits-document-search/src/ragbits/document_search/documents/document.py b/packages/ragbits-document-search/src/ragbits/document_search/documents/document.py
index d7018f44..335e37fc 100644
--- a/packages/ragbits-document-search/src/ragbits/document_search/documents/document.py
+++ b/packages/ragbits-document-search/src/ragbits/document_search/documents/document.py
@@ -30,6 +30,8 @@ class DocumentType(str, Enum):
     TSV = "tsv"
     XML = "xml"
 
+    UNKNOWN = "unknown"
+
 
 class DocumentMeta(BaseModel):
     """
diff --git a/packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured.py b/packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured.py
index e666cee8..037bed34 100644
--- a/packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured.py
+++ b/packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured.py
@@ -28,6 +28,7 @@ class UnstructuredProvider(BaseProvider):
 
     SUPPORTED_DOCUMENT_TYPES = {
         DocumentType.TXT,
+        DocumentType.MD,
         DocumentType.PDF,
         DocumentType.DOCX,
         DocumentType.DOC,
diff --git a/packages/ragbits-document-search/tests/helpers.py b/packages/ragbits-document-search/tests/helpers.py
new file mode 100644
index 00000000..4ed552e5
--- /dev/null
+++ b/packages/ragbits-document-search/tests/helpers.py
@@ -0,0 +1,16 @@
+import os
+
+
+def env_vars_not_set(env_vars: list[str]) -> bool:
+    """
+    Check whether any of the given environment variables is missing.
+
+    Args:
+        env_vars: Names of the environment variables that must be set.
+
+    Returns:
+        True if at least one variable is not set, False otherwise.
+    """
+    # The skipif guards built on this helper list every variable a test needs,
+    # so a single missing variable is enough to report "not set".
+    return any(os.environ.get(env_var) is None for env_var in env_vars)
diff --git a/packages/ragbits-document-search/tests/unit/test_document_processor.py b/packages/ragbits-document-search/tests/unit/test_document_processor.py
new file mode 100644
index 00000000..0e930ff1
--- /dev/null
+++ b/packages/ragbits-document-search/tests/unit/test_document_processor.py
@@ -0,0 +1,58 @@
+from pathlib import Path
+
+import pytest
+
+from ragbits.document_search.documents.document import DocumentMeta, DocumentType
+from ragbits.document_search.ingestion.document_processor import DocumentProcessor
+from ragbits.document_search.ingestion.providers.dummy import DummyProvider
+from ragbits.document_search.ingestion.providers.unstructured import (
+    UNSTRUCTURED_API_KEY_ENV,
+    UNSTRUCTURED_API_URL_ENV,
+    UnstructuredProvider,
+)
+
+from ..helpers import env_vars_not_set
+
+
+async def test_document_processor_processes_text_document_with_dummy_provider():
+    """A processor configured with DummyProvider for TXT must return the literal text as one element."""
+    providers_config = {DocumentType.TXT: DummyProvider()}
+    document_processor = DocumentProcessor.from_config(providers_config)
+    document_meta = DocumentMeta.create_text_document_from_literal("Name of Peppa's brother is George")
+
+    elements = await document_processor.process(document_meta)
+
+    assert isinstance(document_processor._providers[DocumentType.TXT], DummyProvider)
+    assert len(elements) == 1
+    assert elements[0].content == "Name of Peppa's brother is George"
+
+
+@pytest.mark.skipif(
+    env_vars_not_set([UNSTRUCTURED_API_URL_ENV, UNSTRUCTURED_API_KEY_ENV]),
+    reason="Unstructured API environment variables not set",
+)
+async def test_document_processor_processes_text_document_with_unstructured_provider():
+    """A processor built from the default config must handle TXT with UnstructuredProvider."""
+    document_processor = DocumentProcessor.from_config()
+    document_meta = DocumentMeta.create_text_document_from_literal("Name of Peppa's brother is George.")
+
+    elements = await document_processor.process(document_meta)
+
+    assert isinstance(document_processor._providers[DocumentType.TXT], UnstructuredProvider)
+    assert len(elements) == 1
+    assert elements[0].content == "Name of Peppa's brother is George."
+
+
+@pytest.mark.skipif(
+    env_vars_not_set([UNSTRUCTURED_API_URL_ENV, UNSTRUCTURED_API_KEY_ENV]),
+    reason="Unstructured API environment variables not set",
+)
+async def test_document_processor_processes_md_document_with_unstructured_provider():
+    """A processor built from the default config must split a Markdown file from disk into elements."""
+    document_processor = DocumentProcessor.from_config()
+    document_meta = DocumentMeta.from_local_path(Path(__file__).parents[4] / "README.md")
+
+    elements = await document_processor.process(document_meta)
+
+    assert len(elements) > 0
+    assert elements[0].content == "Ragbits"
diff --git a/packages/ragbits-document-search/tests/unit/test_providers.py b/packages/ragbits-document-search/tests/unit/test_providers.py
index e69de29b..7da2570c 100644
--- a/packages/ragbits-document-search/tests/unit/test_providers.py
+++ b/packages/ragbits-document-search/tests/unit/test_providers.py
@@ -0,0 +1,86 @@
+import pytest
+from dotenv import load_dotenv
+
+from ragbits.document_search.documents.document import DocumentMeta, DocumentType
+from ragbits.document_search.ingestion.providers.base import DocumentTypeNotSupportedError
+from ragbits.document_search.ingestion.providers.unstructured import (
+    DEFAULT_PARTITION_KWARGS,
+    UNSTRUCTURED_API_KEY_ENV,
+    UNSTRUCTURED_API_URL_ENV,
+    UnstructuredProvider,
+)
+
+from ..helpers import env_vars_not_set
+
+load_dotenv()
+
+
+# Sort the set so the parametrized cases are collected in a deterministic order.
+@pytest.mark.parametrize("document_type", sorted(UnstructuredProvider.SUPPORTED_DOCUMENT_TYPES))
+def test_unsupported_provider_validates_supported_document_types_passes(document_type: DocumentType):
+    """Every document type declared as supported must pass validation."""
+    UnstructuredProvider().validate_document_type(document_type)
+
+
+def test_unsupported_provider_validates_supported_document_types_fails():
+    """An unsupported document type must raise DocumentTypeNotSupportedError."""
+    with pytest.raises(DocumentTypeNotSupportedError) as err:
+        UnstructuredProvider().validate_document_type(DocumentType.UNKNOWN)
+
+    assert "Document type unknown is not supported by the UnstructuredProvider" in str(err.value)
+
+
+async def test_unstructured_provider_raises_value_error_when_api_key_not_set(monkeypatch: pytest.MonkeyPatch):
+    """Processing must raise ValueError when the API key variable is unset."""
+    # load_dotenv() above may have populated the key from a local .env file.
+    monkeypatch.delenv(UNSTRUCTURED_API_KEY_ENV, raising=False)
+    with pytest.raises(ValueError) as err:
+        await UnstructuredProvider().process(
+            DocumentMeta.create_text_document_from_literal("Name of Peppa's brother is George.")
+        )
+
+    assert f"{UNSTRUCTURED_API_KEY_ENV} environment variable is not set" in str(err.value)
+
+
+async def test_unstructured_provider_raises_value_error_when_api_url_not_set(monkeypatch: pytest.MonkeyPatch):
+    """Processing must raise ValueError when the API URL variable is unset."""
+    monkeypatch.setenv(UNSTRUCTURED_API_KEY_ENV, "dummy_key")
+    # load_dotenv() above may have populated the URL from a local .env file.
+    monkeypatch.delenv(UNSTRUCTURED_API_URL_ENV, raising=False)
+    with pytest.raises(ValueError) as err:
+        await UnstructuredProvider().process(
+            DocumentMeta.create_text_document_from_literal("Name of Peppa's brother is George.")
+        )
+
+    assert f"{UNSTRUCTURED_API_URL_ENV} environment variable is not set" in str(err.value)
+
+
+@pytest.mark.skipif(
+    env_vars_not_set([UNSTRUCTURED_API_URL_ENV, UNSTRUCTURED_API_KEY_ENV]),
+    reason="Unstructured API environment variables not set",
+)
+async def test_unstructured_provider_document_with_default_partition_kwargs():
+    """A provider created without arguments must use DEFAULT_PARTITION_KWARGS and return one element."""
+    document_meta = DocumentMeta.create_text_document_from_literal("Name of Peppa's brother is George.")
+    unstructured_provider = UnstructuredProvider()
+    elements = await unstructured_provider.process(document_meta)
+
+    assert unstructured_provider.partition_kwargs == DEFAULT_PARTITION_KWARGS
+    assert len(elements) == 1
+    assert elements[0].content == "Name of Peppa's brother is George."
+
+
+@pytest.mark.skipif(
+    env_vars_not_set([UNSTRUCTURED_API_URL_ENV, UNSTRUCTURED_API_KEY_ENV]),
+    reason="Unstructured API environment variables not set",
+)
+async def test_unstructured_provider_document_with_custom_partition_kwargs():
+    """Custom partition kwargs must be stored on the provider and still yield one element."""
+    document_meta = DocumentMeta.create_text_document_from_literal("Name of Peppa's brother is George.")
+    partition_kwargs = {"languages": ["pl"], "strategy": "fast"}
+    unstructured_provider = UnstructuredProvider(partition_kwargs=partition_kwargs)
+    elements = await unstructured_provider.process(document_meta)
+
+    assert unstructured_provider.partition_kwargs == partition_kwargs
+    assert len(elements) == 1
+    assert elements[0].content == "Name of Peppa's brother is George."