Skip to content

chore(ci): cache NLTK data #594

chore(ci): cache NLTK data

chore(ci): cache NLTK data #594

GitHub Actions / JUnit Test Report failed Dec 16, 2024 in 0s

225 tests run, 208 passed, 9 skipped, 8 failed.

Annotations

Check failure on line 46 in packages/ragbits-core/tests/integration/vector_stores/test_vector_store.py

See this annotation in the file changed.

@github-actions github-actions / JUnit Test Report

test_vector_store.test_handling_document_ingestion_with_different_content_and_verifying_replacement[vector_store0]

urllib.error.HTTPError: HTTP Error 403: Forbidden
Raw output
vector_store = <ragbits.core.vector_stores.in_memory.InMemoryVectorStore object at 0x7faa5ffdc8e0>

    @pytest.mark.parametrize(
        "vector_store",
        [
            InMemoryVectorStore(),
            ChromaVectorStore(
                client=EphemeralClient(),
                index_name="test_index_name",
            ),
            QdrantVectorStore(
                client=AsyncQdrantClient(":memory:"),
                index_name="test_index_name",
            ),
        ],
    )
    async def test_handling_document_ingestion_with_different_content_and_verifying_replacement(
        vector_store: VectorStore,
    ) -> None:
        document_1_content = "This is a test sentence and it should be in the vector store"
        document_2_content = "This is another test sentence and it should be removed from the vector store"
        document_2_new_content = "This is one more test sentence and it should be added to the vector store"
    
        document_1 = DocumentMeta.create_text_document_from_literal(document_1_content)
        document_2 = DocumentMeta.create_text_document_from_literal(document_2_content)
    
        embedder = AsyncMock()
        embedder.embed_text.return_value = [[0.0], [0.0]]
        document_search = DocumentSearch(
            embedder=embedder,
            vector_store=vector_store,
        )
>       await document_search.ingest([document_1, document_2])

packages/ragbits-core/tests/integration/vector_stores/test_vector_store.py:46: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
packages/ragbits-core/src/ragbits/core/audit/__init__.py:106: in wrapper_async
    returned = await func(*args, **kwargs)  # type: ignore
packages/ragbits-document-search/src/ragbits/document_search/_main.py:157: in ingest
    elements = await self.processing_strategy.process_documents(
packages/ragbits-document-search/src/ragbits/document_search/ingestion/processor_strategies/sequential.py:39: in process_documents
    elements.extend(await self.process_document(document, processor_router, processor_overwrite))
packages/ragbits-document-search/src/ragbits/document_search/ingestion/processor_strategies/base.py:62: in process_document
    return await processor.process(document_meta)
packages/ragbits-core/src/ragbits/core/audit/__init__.py:106: in wrapper_async
    returned = await func(*args, **kwargs)  # type: ignore
packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured/default.py:139: in process
    elements = partition(
.venv/lib/python3.10/site-packages/unstructured/partition/auto.py:416: in partition
    elements = partition_text(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:102: in partition_text
    return _partition_text(
.venv/lib/python3.10/site-packages/unstructured/documents/elements.py:605: in wrapper
    elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/file_utils/filetype.py:731: in wrapper
    elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/file_utils/filetype.py:687: in wrapper
    elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/chunking/dispatch.py:74: in wrapper
    elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:181: in _partition_text
    file_content = _split_by_paragraph(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:361: in _split_by_paragraph
    _split_content_to_fit_max(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:393: in _split_content_to_fit_max
    sentences = sent_tokenize(content)
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:134: in sent_tokenize
    _download_nltk_packages_if_not_present()
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:128: in _download_nltk_packages_if_not_present
    download_nltk_packages()
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:86: in download_nltk_packages
    urllib.request.urlretrieve(NLTK_DATA_URL, tgz_file_path)
/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:241: in urlretrieve
    with contextlib.closing(urlopen(url, data)) as fp:
/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:216: in urlopen
    return opener.open(url, data, timeout)
/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:525: in open
    response = meth(req, response)
/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:634: in http_response
    response = self.parent.error(
/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:563: in error
    return self._call_chain(*args)
/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:496: in _call_chain
    result = func(*args)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = <urllib.request.HTTPDefaultErrorHandler object at 0x7faa31b1b070>
req = <urllib.request.Request object at 0x7faa31b1b7f0>
fp = <http.client.HTTPResponse object at 0x7faa31b184f0>, code = 403
msg = 'Forbidden', hdrs = <http.client.HTTPMessage object at 0x7faa31b1b490>

    def http_error_default(self, req, fp, code, msg, hdrs):
>       raise HTTPError(req.full_url, code, msg, hdrs, fp)
E       urllib.error.HTTPError: HTTP Error 403: Forbidden

/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:643: HTTPError

Check failure on line 46 in packages/ragbits-core/tests/integration/vector_stores/test_vector_store.py

See this annotation in the file changed.

@github-actions github-actions / JUnit Test Report

test_vector_store.test_handling_document_ingestion_with_different_content_and_verifying_replacement[vector_store1]

urllib.error.HTTPError: HTTP Error 403: Forbidden
Raw output
vector_store = <ragbits.core.vector_stores.chroma.ChromaVectorStore object at 0x7faa4d4ee2f0>

    @pytest.mark.parametrize(
        "vector_store",
        [
            InMemoryVectorStore(),
            ChromaVectorStore(
                client=EphemeralClient(),
                index_name="test_index_name",
            ),
            QdrantVectorStore(
                client=AsyncQdrantClient(":memory:"),
                index_name="test_index_name",
            ),
        ],
    )
    async def test_handling_document_ingestion_with_different_content_and_verifying_replacement(
        vector_store: VectorStore,
    ) -> None:
        document_1_content = "This is a test sentence and it should be in the vector store"
        document_2_content = "This is another test sentence and it should be removed from the vector store"
        document_2_new_content = "This is one more test sentence and it should be added to the vector store"
    
        document_1 = DocumentMeta.create_text_document_from_literal(document_1_content)
        document_2 = DocumentMeta.create_text_document_from_literal(document_2_content)
    
        embedder = AsyncMock()
        embedder.embed_text.return_value = [[0.0], [0.0]]
        document_search = DocumentSearch(
            embedder=embedder,
            vector_store=vector_store,
        )
>       await document_search.ingest([document_1, document_2])

packages/ragbits-core/tests/integration/vector_stores/test_vector_store.py:46: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
packages/ragbits-core/src/ragbits/core/audit/__init__.py:106: in wrapper_async
    returned = await func(*args, **kwargs)  # type: ignore
packages/ragbits-document-search/src/ragbits/document_search/_main.py:157: in ingest
    elements = await self.processing_strategy.process_documents(
packages/ragbits-document-search/src/ragbits/document_search/ingestion/processor_strategies/sequential.py:39: in process_documents
    elements.extend(await self.process_document(document, processor_router, processor_overwrite))
packages/ragbits-document-search/src/ragbits/document_search/ingestion/processor_strategies/base.py:62: in process_document
    return await processor.process(document_meta)
packages/ragbits-core/src/ragbits/core/audit/__init__.py:106: in wrapper_async
    returned = await func(*args, **kwargs)  # type: ignore
packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured/default.py:139: in process
    elements = partition(
.venv/lib/python3.10/site-packages/unstructured/partition/auto.py:416: in partition
    elements = partition_text(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:102: in partition_text
    return _partition_text(
.venv/lib/python3.10/site-packages/unstructured/documents/elements.py:605: in wrapper
    elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/file_utils/filetype.py:731: in wrapper
    elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/file_utils/filetype.py:687: in wrapper
    elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/chunking/dispatch.py:74: in wrapper
    elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:181: in _partition_text
    file_content = _split_by_paragraph(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:361: in _split_by_paragraph
    _split_content_to_fit_max(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:393: in _split_content_to_fit_max
    sentences = sent_tokenize(content)
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:134: in sent_tokenize
    _download_nltk_packages_if_not_present()
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:128: in _download_nltk_packages_if_not_present
    download_nltk_packages()
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:86: in download_nltk_packages
    urllib.request.urlretrieve(NLTK_DATA_URL, tgz_file_path)
/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:241: in urlretrieve
    with contextlib.closing(urlopen(url, data)) as fp:
/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:216: in urlopen
    return opener.open(url, data, timeout)
/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:525: in open
    response = meth(req, response)
/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:634: in http_response
    response = self.parent.error(
/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:563: in error
    return self._call_chain(*args)
/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:496: in _call_chain
    result = func(*args)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = <urllib.request.HTTPDefaultErrorHandler object at 0x7faa31b1b070>
req = <urllib.request.Request object at 0x7faa316896f0>
fp = <http.client.HTTPResponse object at 0x7faa30dbd330>, code = 403
msg = 'Forbidden', hdrs = <http.client.HTTPMessage object at 0x7faa30dbd3c0>

    def http_error_default(self, req, fp, code, msg, hdrs):
>       raise HTTPError(req.full_url, code, msg, hdrs, fp)
E       urllib.error.HTTPError: HTTP Error 403: Forbidden

/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:643: HTTPError

Check failure on line 46 in packages/ragbits-core/tests/integration/vector_stores/test_vector_store.py

See this annotation in the file changed.

@github-actions github-actions / JUnit Test Report

test_vector_store.test_handling_document_ingestion_with_different_content_and_verifying_replacement[vector_store2]

urllib.error.HTTPError: HTTP Error 403: Forbidden
Raw output
vector_store = <ragbits.core.vector_stores.qdrant.QdrantVectorStore object at 0x7faa4a8723b0>

    @pytest.mark.parametrize(
        "vector_store",
        [
            InMemoryVectorStore(),
            ChromaVectorStore(
                client=EphemeralClient(),
                index_name="test_index_name",
            ),
            QdrantVectorStore(
                client=AsyncQdrantClient(":memory:"),
                index_name="test_index_name",
            ),
        ],
    )
    async def test_handling_document_ingestion_with_different_content_and_verifying_replacement(
        vector_store: VectorStore,
    ) -> None:
        document_1_content = "This is a test sentence and it should be in the vector store"
        document_2_content = "This is another test sentence and it should be removed from the vector store"
        document_2_new_content = "This is one more test sentence and it should be added to the vector store"
    
        document_1 = DocumentMeta.create_text_document_from_literal(document_1_content)
        document_2 = DocumentMeta.create_text_document_from_literal(document_2_content)
    
        embedder = AsyncMock()
        embedder.embed_text.return_value = [[0.0], [0.0]]
        document_search = DocumentSearch(
            embedder=embedder,
            vector_store=vector_store,
        )
>       await document_search.ingest([document_1, document_2])

packages/ragbits-core/tests/integration/vector_stores/test_vector_store.py:46: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
packages/ragbits-core/src/ragbits/core/audit/__init__.py:106: in wrapper_async
    returned = await func(*args, **kwargs)  # type: ignore
packages/ragbits-document-search/src/ragbits/document_search/_main.py:157: in ingest
    elements = await self.processing_strategy.process_documents(
packages/ragbits-document-search/src/ragbits/document_search/ingestion/processor_strategies/sequential.py:39: in process_documents
    elements.extend(await self.process_document(document, processor_router, processor_overwrite))
packages/ragbits-document-search/src/ragbits/document_search/ingestion/processor_strategies/base.py:62: in process_document
    return await processor.process(document_meta)
packages/ragbits-core/src/ragbits/core/audit/__init__.py:106: in wrapper_async
    returned = await func(*args, **kwargs)  # type: ignore
packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured/default.py:139: in process
    elements = partition(
.venv/lib/python3.10/site-packages/unstructured/partition/auto.py:416: in partition
    elements = partition_text(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:102: in partition_text
    return _partition_text(
.venv/lib/python3.10/site-packages/unstructured/documents/elements.py:605: in wrapper
    elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/file_utils/filetype.py:731: in wrapper
    elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/file_utils/filetype.py:687: in wrapper
    elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/chunking/dispatch.py:74: in wrapper
    elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:181: in _partition_text
    file_content = _split_by_paragraph(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:361: in _split_by_paragraph
    _split_content_to_fit_max(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:393: in _split_content_to_fit_max
    sentences = sent_tokenize(content)
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:134: in sent_tokenize
    _download_nltk_packages_if_not_present()
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:128: in _download_nltk_packages_if_not_present
    download_nltk_packages()
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:86: in download_nltk_packages
    urllib.request.urlretrieve(NLTK_DATA_URL, tgz_file_path)
/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:241: in urlretrieve
    with contextlib.closing(urlopen(url, data)) as fp:
/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:216: in urlopen
    return opener.open(url, data, timeout)
/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:525: in open
    response = meth(req, response)
/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:634: in http_response
    response = self.parent.error(
/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:563: in error
    return self._call_chain(*args)
/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:496: in _call_chain
    result = func(*args)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = <urllib.request.HTTPDefaultErrorHandler object at 0x7faa31b1b070>
req = <urllib.request.Request object at 0x7faa315da1d0>
fp = <http.client.HTTPResponse object at 0x7faa315db670>, code = 403
msg = 'Forbidden', hdrs = <http.client.HTTPMessage object at 0x7faa315dbb80>

    def http_error_default(self, req, fp, code, msg, hdrs):
>       raise HTTPError(req.full_url, code, msg, hdrs, fp)
E       urllib.error.HTTPError: HTTP Error 403: Forbidden

/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:643: HTTPError

Check failure on line 35 in packages/ragbits-document-search/tests/integration/test_unstructured.py

See this annotation in the file changed.

@github-actions github-actions / JUnit Test Report

test_unstructured.test_document_processor_processes_text_document_with_unstructured_provider[config0]

urllib.error.HTTPError: HTTP Error 403: Forbidden
Raw output
config = {}

    @pytest.mark.parametrize(
        "config",
        [
            {},
            pytest.param({DocumentType.TXT: UnstructuredDefaultProvider()}),
            pytest.param(
                {DocumentType.TXT: UnstructuredDefaultProvider(use_api=True)},
                marks=pytest.mark.skipif(
                    env_vars_not_set([UNSTRUCTURED_SERVER_URL_ENV, UNSTRUCTURED_API_KEY_ENV]),
                    reason="Unstructured API environment variables not set",
                ),
            ),
        ],
    )
    async def test_document_processor_processes_text_document_with_unstructured_provider(config: ProvidersConfig):
        document_processor = DocumentProcessorRouter.from_config(config)
        document_meta = DocumentMeta.create_text_document_from_literal("Name of Peppa's brother is George.")
    
>       elements = await document_processor.get_provider(document_meta).process(document_meta)

packages/ragbits-document-search/tests/integration/test_unstructured.py:35: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
packages/ragbits-core/src/ragbits/core/audit/__init__.py:106: in wrapper_async
    returned = await func(*args, **kwargs)  # type: ignore
packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured/default.py:139: in process
    elements = partition(
.venv/lib/python3.10/site-packages/unstructured/partition/auto.py:416: in partition
    elements = partition_text(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:102: in partition_text
    return _partition_text(
.venv/lib/python3.10/site-packages/unstructured/documents/elements.py:605: in wrapper
    elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/file_utils/filetype.py:731: in wrapper
    elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/file_utils/filetype.py:687: in wrapper
    elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/chunking/dispatch.py:74: in wrapper
    elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:181: in _partition_text
    file_content = _split_by_paragraph(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:361: in _split_by_paragraph
    _split_content_to_fit_max(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:393: in _split_content_to_fit_max
    sentences = sent_tokenize(content)
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:134: in sent_tokenize
    _download_nltk_packages_if_not_present()
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:128: in _download_nltk_packages_if_not_present
    download_nltk_packages()
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:86: in download_nltk_packages
    urllib.request.urlretrieve(NLTK_DATA_URL, tgz_file_path)
/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:241: in urlretrieve
    with contextlib.closing(urlopen(url, data)) as fp:
/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:216: in urlopen
    return opener.open(url, data, timeout)
/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:525: in open
    response = meth(req, response)
/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:634: in http_response
    response = self.parent.error(
/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:563: in error
    return self._call_chain(*args)
/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:496: in _call_chain
    result = func(*args)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = <urllib.request.HTTPDefaultErrorHandler object at 0x7faa31b1b070>
req = <urllib.request.Request object at 0x7faa2ec8e2f0>
fp = <http.client.HTTPResponse object at 0x7faa2ec8e200>, code = 403
msg = 'Forbidden', hdrs = <http.client.HTTPMessage object at 0x7faa4d6e5fc0>

    def http_error_default(self, req, fp, code, msg, hdrs):
>       raise HTTPError(req.full_url, code, msg, hdrs, fp)
E       urllib.error.HTTPError: HTTP Error 403: Forbidden

/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:643: HTTPError

Check failure on line 35 in packages/ragbits-document-search/tests/integration/test_unstructured.py

See this annotation in the file changed.

@github-actions github-actions / JUnit Test Report

test_unstructured.test_document_processor_processes_text_document_with_unstructured_provider[config1]

urllib.error.HTTPError: HTTP Error 403: Forbidden
Raw output
config = {<DocumentType.TXT: 'txt'>: <ragbits.document_search.ingestion.providers.unstructured.default.UnstructuredDefaultProvider object at 0x7faa4ade72e0>}

    @pytest.mark.parametrize(
        "config",
        [
            {},
            pytest.param({DocumentType.TXT: UnstructuredDefaultProvider()}),
            pytest.param(
                {DocumentType.TXT: UnstructuredDefaultProvider(use_api=True)},
                marks=pytest.mark.skipif(
                    env_vars_not_set([UNSTRUCTURED_SERVER_URL_ENV, UNSTRUCTURED_API_KEY_ENV]),
                    reason="Unstructured API environment variables not set",
                ),
            ),
        ],
    )
    async def test_document_processor_processes_text_document_with_unstructured_provider(config: ProvidersConfig):
        document_processor = DocumentProcessorRouter.from_config(config)
        document_meta = DocumentMeta.create_text_document_from_literal("Name of Peppa's brother is George.")
    
>       elements = await document_processor.get_provider(document_meta).process(document_meta)

packages/ragbits-document-search/tests/integration/test_unstructured.py:35: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
packages/ragbits-core/src/ragbits/core/audit/__init__.py:106: in wrapper_async
    returned = await func(*args, **kwargs)  # type: ignore
packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured/default.py:139: in process
    elements = partition(
.venv/lib/python3.10/site-packages/unstructured/partition/auto.py:416: in partition
    elements = partition_text(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:102: in partition_text
    return _partition_text(
.venv/lib/python3.10/site-packages/unstructured/documents/elements.py:605: in wrapper
    elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/file_utils/filetype.py:731: in wrapper
    elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/file_utils/filetype.py:687: in wrapper
    elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/chunking/dispatch.py:74: in wrapper
    elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:181: in _partition_text
    file_content = _split_by_paragraph(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:361: in _split_by_paragraph
    _split_content_to_fit_max(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:393: in _split_content_to_fit_max
    sentences = sent_tokenize(content)
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:134: in sent_tokenize
    _download_nltk_packages_if_not_present()
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:128: in _download_nltk_packages_if_not_present
    download_nltk_packages()
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:86: in download_nltk_packages
    urllib.request.urlretrieve(NLTK_DATA_URL, tgz_file_path)
/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:241: in urlretrieve
    with contextlib.closing(urlopen(url, data)) as fp:
/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:216: in urlopen
    return opener.open(url, data, timeout)
/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:525: in open
    response = meth(req, response)
/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:634: in http_response
    response = self.parent.error(
/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:563: in error
    return self._call_chain(*args)
/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:496: in _call_chain
    result = func(*args)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = <urllib.request.HTTPDefaultErrorHandler object at 0x7faa31b1b070>
req = <urllib.request.Request object at 0x7faa2e81ea40>
fp = <http.client.HTTPResponse object at 0x7faa2e81f520>, code = 403
msg = 'Forbidden', hdrs = <http.client.HTTPMessage object at 0x7faa2e81e9b0>

    def http_error_default(self, req, fp, code, msg, hdrs):
>       raise HTTPError(req.full_url, code, msg, hdrs, fp)
E       urllib.error.HTTPError: HTTP Error 403: Forbidden

/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:643: HTTPError

Check failure on line 96 in packages/ragbits-document-search/tests/integration/test_unstructured.py

See this annotation in the file changed.

@github-actions github-actions / JUnit Test Report

test_unstructured.test_unstructured_provider_document_with_default_partition_kwargs[False]

urllib.error.HTTPError: HTTP Error 403: Forbidden
Raw output
use_api = False

    @pytest.mark.parametrize(
        "use_api",
        [
            False,
            pytest.param(
                True,
                marks=pytest.mark.skipif(
                    env_vars_not_set([UNSTRUCTURED_SERVER_URL_ENV, UNSTRUCTURED_API_KEY_ENV]),
                    reason="Unstructured API environment variables not set",
                ),
            ),
        ],
    )
    async def test_unstructured_provider_document_with_default_partition_kwargs(use_api: bool):
        document_meta = DocumentMeta.create_text_document_from_literal("Name of Peppa's brother is George.")
        unstructured_provider = UnstructuredDefaultProvider(use_api=use_api)
>       elements = await unstructured_provider.process(document_meta)

packages/ragbits-document-search/tests/integration/test_unstructured.py:96: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
packages/ragbits-core/src/ragbits/core/audit/__init__.py:106: in wrapper_async
    returned = await func(*args, **kwargs)  # type: ignore
packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured/default.py:139: in process
    elements = partition(
.venv/lib/python3.10/site-packages/unstructured/partition/auto.py:416: in partition
    elements = partition_text(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:102: in partition_text
    return _partition_text(
.venv/lib/python3.10/site-packages/unstructured/documents/elements.py:605: in wrapper
    elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/file_utils/filetype.py:731: in wrapper
    elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/file_utils/filetype.py:687: in wrapper
    elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/chunking/dispatch.py:74: in wrapper
    elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:181: in _partition_text
    file_content = _split_by_paragraph(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:361: in _split_by_paragraph
    _split_content_to_fit_max(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:393: in _split_content_to_fit_max
    sentences = sent_tokenize(content)
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:134: in sent_tokenize
    _download_nltk_packages_if_not_present()
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:128: in _download_nltk_packages_if_not_present
    download_nltk_packages()
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:86: in download_nltk_packages
    urllib.request.urlretrieve(NLTK_DATA_URL, tgz_file_path)
/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:241: in urlretrieve
    with contextlib.closing(urlopen(url, data)) as fp:
/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:216: in urlopen
    return opener.open(url, data, timeout)
/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:525: in open
    response = meth(req, response)
/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:634: in http_response
    response = self.parent.error(
/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:563: in error
    return self._call_chain(*args)
/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:496: in _call_chain
    result = func(*args)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = <urllib.request.HTTPDefaultErrorHandler object at 0x7faa31b1b070>
req = <urllib.request.Request object at 0x7faa2e765780>
fp = <http.client.HTTPResponse object at 0x7faa2e7657e0>, code = 403
msg = 'Forbidden', hdrs = <http.client.HTTPMessage object at 0x7faa2e765630>

    def http_error_default(self, req, fp, code, msg, hdrs):
>       raise HTTPError(req.full_url, code, msg, hdrs, fp)
E       urllib.error.HTTPError: HTTP Error 403: Forbidden

/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:643: HTTPError

Check failure on line 120 in packages/ragbits-document-search/tests/integration/test_unstructured.py

See this annotation in the file changed.

@github-actions github-actions / JUnit Test Report

test_unstructured.test_unstructured_provider_document_with_custom_partition_kwargs[False]

urllib.error.HTTPError: HTTP Error 403: Forbidden
Raw output
use_api = False

    @pytest.mark.parametrize(
        "use_api",
        [
            False,
            pytest.param(
                True,
                marks=pytest.mark.skipif(
                    env_vars_not_set([UNSTRUCTURED_SERVER_URL_ENV, UNSTRUCTURED_API_KEY_ENV]),
                    reason="Unstructured API environment variables not set",
                ),
            ),
        ],
    )
    async def test_unstructured_provider_document_with_custom_partition_kwargs(use_api: bool):
        document_meta = DocumentMeta.create_text_document_from_literal("Name of Peppa's brother is George.")
        partition_kwargs = {"languages": ["pl"], "strategy": "fast"}
        unstructured_provider = UnstructuredDefaultProvider(use_api=use_api, partition_kwargs=partition_kwargs)
>       elements = await unstructured_provider.process(document_meta)

packages/ragbits-document-search/tests/integration/test_unstructured.py:120: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
packages/ragbits-core/src/ragbits/core/audit/__init__.py:106: in wrapper_async
    returned = await func(*args, **kwargs)  # type: ignore
packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured/default.py:139: in process
    elements = partition(
.venv/lib/python3.10/site-packages/unstructured/partition/auto.py:416: in partition
    elements = partition_text(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:102: in partition_text
    return _partition_text(
.venv/lib/python3.10/site-packages/unstructured/documents/elements.py:605: in wrapper
    elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/file_utils/filetype.py:731: in wrapper
    elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/file_utils/filetype.py:687: in wrapper
    elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/chunking/dispatch.py:74: in wrapper
    elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:181: in _partition_text
    file_content = _split_by_paragraph(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:361: in _split_by_paragraph
    _split_content_to_fit_max(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:393: in _split_content_to_fit_max
    sentences = sent_tokenize(content)
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:134: in sent_tokenize
    _download_nltk_packages_if_not_present()
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:128: in _download_nltk_packages_if_not_present
    download_nltk_packages()
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:86: in download_nltk_packages
    urllib.request.urlretrieve(NLTK_DATA_URL, tgz_file_path)
/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:241: in urlretrieve
    with contextlib.closing(urlopen(url, data)) as fp:
/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:216: in urlopen
    return opener.open(url, data, timeout)
/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:525: in open
    response = meth(req, response)
/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:634: in http_response
    response = self.parent.error(
/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:563: in error
    return self._call_chain(*args)
/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:496: in _call_chain
    result = func(*args)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = <urllib.request.HTTPDefaultErrorHandler object at 0x7faa31b1b070>
req = <urllib.request.Request object at 0x7faa2e812740>
fp = <http.client.HTTPResponse object at 0x7faa2e812920>, code = 403
msg = 'Forbidden', hdrs = <http.client.HTTPMessage object at 0x7faa2e811c90>

    def http_error_default(self, req, fp, code, msg, hdrs):
>       raise HTTPError(req.full_url, code, msg, hdrs, fp)
E       urllib.error.HTTPError: HTTP Error 403: Forbidden

/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:643: HTTPError

Check failure on line 199 in packages/ragbits-document-search/tests/unit/test_document_search.py

See this annotation in the file changed.

@github-actions github-actions / JUnit Test Report

test_document_search.test_document_search_with_batched

urllib.error.HTTPError: HTTP Error 403: Forbidden
Raw output
async def test_document_search_with_batched():
        documents = [
            DocumentMeta.create_text_document_from_literal("Name of Peppa's brother is George"),
            DocumentMeta.create_text_document_from_literal("Name of Peppa's father is Daddy Pig"),
            DocumentMeta.create_text_document_from_literal("Name of Peppa's mother is Mummy Pig"),
            DocumentMeta.create_text_document_from_literal("Name of Peppa's friend is Suzy Sheep"),
            DocumentMeta.create_text_document_from_literal("Name of Peppa's friend is Danny Dog"),
            DocumentMeta.create_text_document_from_literal("Name of Peppa's friend is Pedro Pony"),
            DocumentMeta.create_text_document_from_literal("Name of Peppa's friend is Emily Elephant"),
            DocumentMeta.create_text_document_from_literal("Name of Peppa's friend is Candy Cat"),
            DocumentMeta.create_text_document_from_literal("Name of Peppa's teacher is Madame Gazelle"),
            DocumentMeta.create_text_document_from_literal("Name of Peppa's doctor is Dr. Brown Bear"),
            DocumentMeta.create_text_document_from_literal("Name of Peppa's cousin is Chloe Pig"),
            DocumentMeta.create_text_document_from_literal("Name of Peppa's cousin is Alexander Pig"),
        ]
    
        embeddings_mock = AsyncMock()
        embeddings_mock.embed_text.return_value = [[0.1, 0.1]] * len(documents)
    
        processing_strategy = BatchedAsyncProcessing(batch_size=5)
        vectore_store = InMemoryVectorStore()
    
        document_search = DocumentSearch(
            embedder=embeddings_mock,
            vector_store=vectore_store,
            processing_strategy=processing_strategy,
        )
    
>       await document_search.ingest(documents)

packages/ragbits-document-search/tests/unit/test_document_search.py:199: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
packages/ragbits-core/src/ragbits/core/audit/__init__.py:106: in wrapper_async
    returned = await func(*args, **kwargs)  # type: ignore
packages/ragbits-document-search/src/ragbits/document_search/_main.py:157: in ingest
    elements = await self.processing_strategy.process_documents(
packages/ragbits-document-search/src/ragbits/document_search/ingestion/processor_strategies/batched.py:59: in process_documents
    responses = await asyncio.gather(
packages/ragbits-document-search/src/ragbits/document_search/ingestion/processor_strategies/batched.py:35: in _process_with_semaphore
    return await self.process_document(document, processor_router, processor_overwrite)
packages/ragbits-document-search/src/ragbits/document_search/ingestion/processor_strategies/base.py:62: in process_document
    return await processor.process(document_meta)
packages/ragbits-core/src/ragbits/core/audit/__init__.py:106: in wrapper_async
    returned = await func(*args, **kwargs)  # type: ignore
packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured/default.py:139: in process
    elements = partition(
.venv/lib/python3.10/site-packages/unstructured/partition/auto.py:416: in partition
    elements = partition_text(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:102: in partition_text
    return _partition_text(
.venv/lib/python3.10/site-packages/unstructured/documents/elements.py:605: in wrapper
    elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/file_utils/filetype.py:731: in wrapper
    elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/file_utils/filetype.py:687: in wrapper
    elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/chunking/dispatch.py:74: in wrapper
    elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:181: in _partition_text
    file_content = _split_by_paragraph(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:361: in _split_by_paragraph
    _split_content_to_fit_max(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:393: in _split_content_to_fit_max
    sentences = sent_tokenize(content)
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:134: in sent_tokenize
    _download_nltk_packages_if_not_present()
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:128: in _download_nltk_packages_if_not_present
    download_nltk_packages()
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:86: in download_nltk_packages
    urllib.request.urlretrieve(NLTK_DATA_URL, tgz_file_path)
/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:241: in urlretrieve
    with contextlib.closing(urlopen(url, data)) as fp:
/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:216: in urlopen
    return opener.open(url, data, timeout)
/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:525: in open
    response = meth(req, response)
/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:634: in http_response
    response = self.parent.error(
/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:563: in error
    return self._call_chain(*args)
/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:496: in _call_chain
    result = func(*args)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = <urllib.request.HTTPDefaultErrorHandler object at 0x7faa31b1b070>
req = <urllib.request.Request object at 0x7faa2ebdad40>
fp = <http.client.HTTPResponse object at 0x7faa2ebdab30>, code = 403
msg = 'Forbidden', hdrs = <http.client.HTTPMessage object at 0x7faa2ebdac20>

    def http_error_default(self, req, fp, code, msg, hdrs):
>       raise HTTPError(req.full_url, code, msg, hdrs, fp)
E       urllib.error.HTTPError: HTTP Error 403: Forbidden

/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/urllib/request.py:643: HTTPError