From 1332c74d98363d87ff38824eb30b51408eb3cf41 Mon Sep 17 00:00:00 2001 From: Alan Konarski <alan.konarski@deepsense.ai> Date: Mon, 23 Sep 2024 11:57:23 +0200 Subject: [PATCH] Move unstructured tests to integration tests folder --- .../tests/integration/test_unstructured.py | 72 +++++++++++++++++++ .../tests/unit/test_document_processor.py | 40 ----------- .../tests/unit/test_providers.py | 32 --------- 3 files changed, 72 insertions(+), 72 deletions(-) create mode 100644 packages/ragbits-document-search/tests/integration/test_unstructured.py diff --git a/packages/ragbits-document-search/tests/integration/test_unstructured.py b/packages/ragbits-document-search/tests/integration/test_unstructured.py new file mode 100644 index 00000000..a48c1f49 --- /dev/null +++ b/packages/ragbits-document-search/tests/integration/test_unstructured.py @@ -0,0 +1,72 @@ +from pathlib import Path + +import pytest + +from ragbits.document_search.documents.document import DocumentMeta, DocumentType +from ragbits.document_search.ingestion.document_processor import DocumentProcessor +from ragbits.document_search.ingestion.providers.unstructured import ( + DEFAULT_PARTITION_KWARGS, + UNSTRUCTURED_API_KEY_ENV, + UNSTRUCTURED_API_URL_ENV, + UnstructuredProvider, +) + +from ..helpers import env_vars_not_set + + +@pytest.mark.skipif( + env_vars_not_set([UNSTRUCTURED_API_URL_ENV, UNSTRUCTURED_API_KEY_ENV]), + reason="Unstructured API environment variables not set", +) +async def test_document_processor_processes_text_document_with_unstructured_provider(): + document_processor = DocumentProcessor.from_config() + document_meta = DocumentMeta.create_text_document_from_literal("Name of Peppa's brother is George.") + + elements = await document_processor.process(document_meta) + + assert isinstance(document_processor._providers[DocumentType.TXT], UnstructuredProvider) + assert len(elements) == 1 + assert elements[0].content == "Name of Peppa's brother is George" + + +@pytest.mark.skipif( + env_vars_not_set([UNSTRUCTURED_API_URL_ENV, UNSTRUCTURED_API_KEY_ENV]), + reason="Unstructured API environment variables not set", +) +async def test_document_processor_processes_md_document_with_unstructured_provider(): + document_processor = DocumentProcessor.from_config() + document_meta = DocumentMeta.from_local_path(Path(__file__).parent.parent.parent.parent.parent / "README.md") + + elements = await document_processor.process(document_meta) + + assert len(elements) > 0 + assert elements[0].content == "Ragbits" + + +@pytest.mark.skipif( + env_vars_not_set([UNSTRUCTURED_API_URL_ENV, UNSTRUCTURED_API_KEY_ENV]), + reason="Unstructured API environment variables not set", +) +async def test_unstructured_provider_document_with_default_partition_kwargs(): + document_meta = DocumentMeta.create_text_document_from_literal("Name of Peppa's brother is George.") + unstructured_provider = UnstructuredProvider() + elements = await unstructured_provider.process(document_meta) + + assert unstructured_provider.partition_kwargs == DEFAULT_PARTITION_KWARGS + assert len(elements) == 1 + assert elements[0].content == "Name of Peppa's brother is George." + + +@pytest.mark.skipif( + env_vars_not_set([UNSTRUCTURED_API_URL_ENV, UNSTRUCTURED_API_KEY_ENV]), + reason="Unstructured API environment variables not set", +) +async def test_unstructured_provider_document_with_custom_partition_kwargs(): + document_meta = DocumentMeta.create_text_document_from_literal("Name of Peppa's brother is George.") + partition_kwargs = {"languages": ["pl"], "strategy": "fast"} + unstructured_provider = UnstructuredProvider(partition_kwargs=partition_kwargs) + elements = await unstructured_provider.process(document_meta) + + assert unstructured_provider.partition_kwargs == partition_kwargs + assert len(elements) == 1 + assert elements[0].content == "Name of Peppa's brother is George." diff --git a/packages/ragbits-document-search/tests/unit/test_document_processor.py b/packages/ragbits-document-search/tests/unit/test_document_processor.py index 0e930ff1..f329e8b3 100644 --- a/packages/ragbits-document-search/tests/unit/test_document_processor.py +++ b/packages/ragbits-document-search/tests/unit/test_document_processor.py @@ -1,17 +1,6 @@ -from pathlib import Path - -import pytest - from ragbits.document_search.documents.document import DocumentMeta, DocumentType from ragbits.document_search.ingestion.document_processor import DocumentProcessor from ragbits.document_search.ingestion.providers.dummy import DummyProvider -from ragbits.document_search.ingestion.providers.unstructured import ( - UNSTRUCTURED_API_KEY_ENV, - UNSTRUCTURED_API_URL_ENV, - UnstructuredProvider, -) - -from ..helpers import env_vars_not_set async def test_document_processor_processes_text_document_with_dummy_provider(): @@ -24,32 +13,3 @@ async def test_document_processor_processes_text_document_with_dummy_provider(): assert isinstance(document_processor._providers[DocumentType.TXT], DummyProvider) assert len(elements) == 1 assert elements[0].content == "Name of Peppa's brother is George" - - -@pytest.mark.skipif( - env_vars_not_set([UNSTRUCTURED_API_URL_ENV, UNSTRUCTURED_API_KEY_ENV]), - reason="Unstructured API environment variables not set", -) -async def test_document_processor_processes_text_document_with_unstructured_provider(): - document_processor = DocumentProcessor.from_config() - document_meta = DocumentMeta.create_text_document_from_literal("Name of Peppa's brother is George.") - - elements = await document_processor.process(document_meta) - - assert isinstance(document_processor._providers[DocumentType.TXT], UnstructuredProvider) - assert len(elements) == 1 - assert elements[0].content == "Name of Peppa's brother is George" - - -@pytest.mark.skipif( - env_vars_not_set([UNSTRUCTURED_API_URL_ENV, UNSTRUCTURED_API_KEY_ENV]), - reason="Unstructured API environment variables not set", -) -async def test_document_processor_processes_md_document_with_unstructured_provider(): - document_processor = DocumentProcessor.from_config() - document_meta = DocumentMeta.from_local_path(Path(__file__).parent.parent.parent.parent.parent / "README.md") - - elements = await document_processor.process(document_meta) - - assert len(elements) > 0 - assert elements[0].content == "Ragbits" diff --git a/packages/ragbits-document-search/tests/unit/test_providers.py b/packages/ragbits-document-search/tests/unit/test_providers.py index 7da2570c..5bde8e52 100644 --- a/packages/ragbits-document-search/tests/unit/test_providers.py +++ b/packages/ragbits-document-search/tests/unit/test_providers.py @@ -4,14 +4,11 @@ from ragbits.document_search.documents.document import DocumentMeta, DocumentType from ragbits.document_search.ingestion.providers.base import DocumentTypeNotSupportedError from ragbits.document_search.ingestion.providers.unstructured import ( - DEFAULT_PARTITION_KWARGS, UNSTRUCTURED_API_KEY_ENV, UNSTRUCTURED_API_URL_ENV, UnstructuredProvider, ) -from ..helpers import env_vars_not_set - load_dotenv() @@ -44,32 +41,3 @@ async def test_unstructured_provider_raises_value_error_when_api_url_not_set(mon ) assert f"{UNSTRUCTURED_API_URL_ENV} environment variable is not set" in str(err.value) - - -@pytest.mark.skipif( - env_vars_not_set([UNSTRUCTURED_API_URL_ENV, UNSTRUCTURED_API_KEY_ENV]), - reason="Unstructured API environment variables not set", -) -async def test_unstructured_provider_document_with_default_partition_kwargs(): - document_meta = DocumentMeta.create_text_document_from_literal("Name of Peppa's brother is George.") - unstructured_provider = UnstructuredProvider() - elements = await unstructured_provider.process(document_meta) - - assert unstructured_provider.partition_kwargs == DEFAULT_PARTITION_KWARGS - assert len(elements) == 1 - assert elements[0].content == "Name of Peppa's brother is George." - - -@pytest.mark.skipif( - env_vars_not_set([UNSTRUCTURED_API_URL_ENV, UNSTRUCTURED_API_KEY_ENV]), - reason="Unstructured API environment variables not set", -) -async def test_unstructured_provider_document_with_custom_partition_kwargs(): - document_meta = DocumentMeta.create_text_document_from_literal("Name of Peppa's brother is George.") - partition_kwargs = {"languages": ["pl"], "strategy": "fast"} - unstructured_provider = UnstructuredProvider(partition_kwargs=partition_kwargs) - elements = await unstructured_provider.process(document_meta) - - assert unstructured_provider.partition_kwargs == partition_kwargs - assert len(elements) == 1 - assert elements[0].content == "Name of Peppa's brother is George."