chore(ci): cache NLTK data #594
225 tests run, 208 passed, 9 skipped, 8 failed.
Annotations
github-actions / JUnit Test Report
test_vector_store.test_handling_document_ingestion_with_different_content_and_verifying_replacement[vector_store0]
urllib.error.HTTPError: HTTP Error 403: Forbidden
Raw output
vector_store = <ragbits.core.vector_stores.in_memory.InMemoryVectorStore object at 0x7fef68dcd5a0>
@pytest.mark.parametrize(
"vector_store",
[
InMemoryVectorStore(),
ChromaVectorStore(
client=EphemeralClient(),
index_name="test_index_name",
),
QdrantVectorStore(
client=AsyncQdrantClient(":memory:"),
index_name="test_index_name",
),
],
)
async def test_handling_document_ingestion_with_different_content_and_verifying_replacement(
vector_store: VectorStore,
) -> None:
document_1_content = "This is a test sentence and it should be in the vector store"
document_2_content = "This is another test sentence and it should be removed from the vector store"
document_2_new_content = "This is one more test sentence and it should be added to the vector store"
document_1 = DocumentMeta.create_text_document_from_literal(document_1_content)
document_2 = DocumentMeta.create_text_document_from_literal(document_2_content)
embedder = AsyncMock()
embedder.embed_text.return_value = [[0.0], [0.0]]
document_search = DocumentSearch(
embedder=embedder,
vector_store=vector_store,
)
> await document_search.ingest([document_1, document_2])
packages/ragbits-core/tests/integration/vector_stores/test_vector_store.py:46:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
packages/ragbits-core/src/ragbits/core/audit/__init__.py:106: in wrapper_async
returned = await func(*args, **kwargs) # type: ignore
packages/ragbits-document-search/src/ragbits/document_search/_main.py:157: in ingest
elements = await self.processing_strategy.process_documents(
packages/ragbits-document-search/src/ragbits/document_search/ingestion/processor_strategies/sequential.py:39: in process_documents
elements.extend(await self.process_document(document, processor_router, processor_overwrite))
packages/ragbits-document-search/src/ragbits/document_search/ingestion/processor_strategies/base.py:62: in process_document
return await processor.process(document_meta)
packages/ragbits-core/src/ragbits/core/audit/__init__.py:106: in wrapper_async
returned = await func(*args, **kwargs) # type: ignore
packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured/default.py:139: in process
elements = partition(
.venv/lib/python3.10/site-packages/unstructured/partition/auto.py:416: in partition
elements = partition_text(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:102: in partition_text
return _partition_text(
.venv/lib/python3.10/site-packages/unstructured/documents/elements.py:605: in wrapper
elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/file_utils/filetype.py:731: in wrapper
elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/file_utils/filetype.py:687: in wrapper
elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/chunking/dispatch.py:74: in wrapper
elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:181: in _partition_text
file_content = _split_by_paragraph(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:361: in _split_by_paragraph
_split_content_to_fit_max(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:393: in _split_content_to_fit_max
sentences = sent_tokenize(content)
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:134: in sent_tokenize
_download_nltk_packages_if_not_present()
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:128: in _download_nltk_packages_if_not_present
download_nltk_packages()
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:86: in download_nltk_packages
urllib.request.urlretrieve(NLTK_DATA_URL, tgz_file_path)
/usr/lib/python3.10/urllib/request.py:241: in urlretrieve
with contextlib.closing(urlopen(url, data)) as fp:
/usr/lib/python3.10/urllib/request.py:216: in urlopen
return opener.open(url, data, timeout)
/usr/lib/python3.10/urllib/request.py:525: in open
response = meth(req, response)
/usr/lib/python3.10/urllib/request.py:634: in http_response
response = self.parent.error(
/usr/lib/python3.10/urllib/request.py:563: in error
return self._call_chain(*args)
/usr/lib/python3.10/urllib/request.py:496: in _call_chain
result = func(*args)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <urllib.request.HTTPDefaultErrorHandler object at 0x7fef3c4219f0>
req = <urllib.request.Request object at 0x7fef3c421630>
fp = <http.client.HTTPResponse object at 0x7fef3c420d60>, code = 403
msg = 'Forbidden', hdrs = <http.client.HTTPMessage object at 0x7fef3c4206a0>
def http_error_default(self, req, fp, code, msg, hdrs):
> raise HTTPError(req.full_url, code, msg, hdrs, fp)
E urllib.error.HTTPError: HTTP Error 403: Forbidden
/usr/lib/python3.10/urllib/request.py:643: HTTPError
github-actions / JUnit Test Report
test_vector_store.test_handling_document_ingestion_with_different_content_and_verifying_replacement[vector_store1]
urllib.error.HTTPError: HTTP Error 403: Forbidden
Raw output
vector_store = <ragbits.core.vector_stores.chroma.ChromaVectorStore object at 0x7fef55bba710>
@pytest.mark.parametrize(
"vector_store",
[
InMemoryVectorStore(),
ChromaVectorStore(
client=EphemeralClient(),
index_name="test_index_name",
),
QdrantVectorStore(
client=AsyncQdrantClient(":memory:"),
index_name="test_index_name",
),
],
)
async def test_handling_document_ingestion_with_different_content_and_verifying_replacement(
vector_store: VectorStore,
) -> None:
document_1_content = "This is a test sentence and it should be in the vector store"
document_2_content = "This is another test sentence and it should be removed from the vector store"
document_2_new_content = "This is one more test sentence and it should be added to the vector store"
document_1 = DocumentMeta.create_text_document_from_literal(document_1_content)
document_2 = DocumentMeta.create_text_document_from_literal(document_2_content)
embedder = AsyncMock()
embedder.embed_text.return_value = [[0.0], [0.0]]
document_search = DocumentSearch(
embedder=embedder,
vector_store=vector_store,
)
> await document_search.ingest([document_1, document_2])
packages/ragbits-core/tests/integration/vector_stores/test_vector_store.py:46:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
packages/ragbits-core/src/ragbits/core/audit/__init__.py:106: in wrapper_async
returned = await func(*args, **kwargs) # type: ignore
packages/ragbits-document-search/src/ragbits/document_search/_main.py:157: in ingest
elements = await self.processing_strategy.process_documents(
packages/ragbits-document-search/src/ragbits/document_search/ingestion/processor_strategies/sequential.py:39: in process_documents
elements.extend(await self.process_document(document, processor_router, processor_overwrite))
packages/ragbits-document-search/src/ragbits/document_search/ingestion/processor_strategies/base.py:62: in process_document
return await processor.process(document_meta)
packages/ragbits-core/src/ragbits/core/audit/__init__.py:106: in wrapper_async
returned = await func(*args, **kwargs) # type: ignore
packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured/default.py:139: in process
elements = partition(
.venv/lib/python3.10/site-packages/unstructured/partition/auto.py:416: in partition
elements = partition_text(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:102: in partition_text
return _partition_text(
.venv/lib/python3.10/site-packages/unstructured/documents/elements.py:605: in wrapper
elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/file_utils/filetype.py:731: in wrapper
elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/file_utils/filetype.py:687: in wrapper
elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/chunking/dispatch.py:74: in wrapper
elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:181: in _partition_text
file_content = _split_by_paragraph(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:361: in _split_by_paragraph
_split_content_to_fit_max(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:393: in _split_content_to_fit_max
sentences = sent_tokenize(content)
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:134: in sent_tokenize
_download_nltk_packages_if_not_present()
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:128: in _download_nltk_packages_if_not_present
download_nltk_packages()
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:86: in download_nltk_packages
urllib.request.urlretrieve(NLTK_DATA_URL, tgz_file_path)
/usr/lib/python3.10/urllib/request.py:241: in urlretrieve
with contextlib.closing(urlopen(url, data)) as fp:
/usr/lib/python3.10/urllib/request.py:216: in urlopen
return opener.open(url, data, timeout)
/usr/lib/python3.10/urllib/request.py:525: in open
response = meth(req, response)
/usr/lib/python3.10/urllib/request.py:634: in http_response
response = self.parent.error(
/usr/lib/python3.10/urllib/request.py:563: in error
return self._call_chain(*args)
/usr/lib/python3.10/urllib/request.py:496: in _call_chain
result = func(*args)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <urllib.request.HTTPDefaultErrorHandler object at 0x7fef3c4219f0>
req = <urllib.request.Request object at 0x7fef39d8f730>
fp = <http.client.HTTPResponse object at 0x7fef39d8feb0>, code = 403
msg = 'Forbidden', hdrs = <http.client.HTTPMessage object at 0x7fef39d8f310>
def http_error_default(self, req, fp, code, msg, hdrs):
> raise HTTPError(req.full_url, code, msg, hdrs, fp)
E urllib.error.HTTPError: HTTP Error 403: Forbidden
/usr/lib/python3.10/urllib/request.py:643: HTTPError
github-actions / JUnit Test Report
test_vector_store.test_handling_document_ingestion_with_different_content_and_verifying_replacement[vector_store2]
urllib.error.HTTPError: HTTP Error 403: Forbidden
Raw output
vector_store = <ragbits.core.vector_stores.qdrant.QdrantVectorStore object at 0x7fef55d17010>
@pytest.mark.parametrize(
"vector_store",
[
InMemoryVectorStore(),
ChromaVectorStore(
client=EphemeralClient(),
index_name="test_index_name",
),
QdrantVectorStore(
client=AsyncQdrantClient(":memory:"),
index_name="test_index_name",
),
],
)
async def test_handling_document_ingestion_with_different_content_and_verifying_replacement(
vector_store: VectorStore,
) -> None:
document_1_content = "This is a test sentence and it should be in the vector store"
document_2_content = "This is another test sentence and it should be removed from the vector store"
document_2_new_content = "This is one more test sentence and it should be added to the vector store"
document_1 = DocumentMeta.create_text_document_from_literal(document_1_content)
document_2 = DocumentMeta.create_text_document_from_literal(document_2_content)
embedder = AsyncMock()
embedder.embed_text.return_value = [[0.0], [0.0]]
document_search = DocumentSearch(
embedder=embedder,
vector_store=vector_store,
)
> await document_search.ingest([document_1, document_2])
packages/ragbits-core/tests/integration/vector_stores/test_vector_store.py:46:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
packages/ragbits-core/src/ragbits/core/audit/__init__.py:106: in wrapper_async
returned = await func(*args, **kwargs) # type: ignore
packages/ragbits-document-search/src/ragbits/document_search/_main.py:157: in ingest
elements = await self.processing_strategy.process_documents(
packages/ragbits-document-search/src/ragbits/document_search/ingestion/processor_strategies/sequential.py:39: in process_documents
elements.extend(await self.process_document(document, processor_router, processor_overwrite))
packages/ragbits-document-search/src/ragbits/document_search/ingestion/processor_strategies/base.py:62: in process_document
return await processor.process(document_meta)
packages/ragbits-core/src/ragbits/core/audit/__init__.py:106: in wrapper_async
returned = await func(*args, **kwargs) # type: ignore
packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured/default.py:139: in process
elements = partition(
.venv/lib/python3.10/site-packages/unstructured/partition/auto.py:416: in partition
elements = partition_text(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:102: in partition_text
return _partition_text(
.venv/lib/python3.10/site-packages/unstructured/documents/elements.py:605: in wrapper
elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/file_utils/filetype.py:731: in wrapper
elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/file_utils/filetype.py:687: in wrapper
elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/chunking/dispatch.py:74: in wrapper
elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:181: in _partition_text
file_content = _split_by_paragraph(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:361: in _split_by_paragraph
_split_content_to_fit_max(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:393: in _split_content_to_fit_max
sentences = sent_tokenize(content)
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:134: in sent_tokenize
_download_nltk_packages_if_not_present()
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:128: in _download_nltk_packages_if_not_present
download_nltk_packages()
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:86: in download_nltk_packages
urllib.request.urlretrieve(NLTK_DATA_URL, tgz_file_path)
/usr/lib/python3.10/urllib/request.py:241: in urlretrieve
with contextlib.closing(urlopen(url, data)) as fp:
/usr/lib/python3.10/urllib/request.py:216: in urlopen
return opener.open(url, data, timeout)
/usr/lib/python3.10/urllib/request.py:525: in open
response = meth(req, response)
/usr/lib/python3.10/urllib/request.py:634: in http_response
response = self.parent.error(
/usr/lib/python3.10/urllib/request.py:563: in error
return self._call_chain(*args)
/usr/lib/python3.10/urllib/request.py:496: in _call_chain
result = func(*args)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <urllib.request.HTTPDefaultErrorHandler object at 0x7fef3c4219f0>
req = <urllib.request.Request object at 0x7fef37ed1ed0>
fp = <http.client.HTTPResponse object at 0x7fef37ed2bc0>, code = 403
msg = 'Forbidden', hdrs = <http.client.HTTPMessage object at 0x7fef37ed26b0>
def http_error_default(self, req, fp, code, msg, hdrs):
> raise HTTPError(req.full_url, code, msg, hdrs, fp)
E urllib.error.HTTPError: HTTP Error 403: Forbidden
/usr/lib/python3.10/urllib/request.py:643: HTTPError
Check failure on line 35 in packages/ragbits-document-search/tests/integration/test_unstructured.py
github-actions / JUnit Test Report
test_unstructured.test_document_processor_processes_text_document_with_unstructured_provider[config0]
urllib.error.HTTPError: HTTP Error 403: Forbidden
Raw output
config = {}
@pytest.mark.parametrize(
"config",
[
{},
pytest.param({DocumentType.TXT: UnstructuredDefaultProvider()}),
pytest.param(
{DocumentType.TXT: UnstructuredDefaultProvider(use_api=True)},
marks=pytest.mark.skipif(
env_vars_not_set([UNSTRUCTURED_SERVER_URL_ENV, UNSTRUCTURED_API_KEY_ENV]),
reason="Unstructured API environment variables not set",
),
),
],
)
async def test_document_processor_processes_text_document_with_unstructured_provider(config: ProvidersConfig):
document_processor = DocumentProcessorRouter.from_config(config)
document_meta = DocumentMeta.create_text_document_from_literal("Name of Peppa's brother is George.")
> elements = await document_processor.get_provider(document_meta).process(document_meta)
packages/ragbits-document-search/tests/integration/test_unstructured.py:35:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
packages/ragbits-core/src/ragbits/core/audit/__init__.py:106: in wrapper_async
returned = await func(*args, **kwargs) # type: ignore
packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured/default.py:139: in process
elements = partition(
.venv/lib/python3.10/site-packages/unstructured/partition/auto.py:416: in partition
elements = partition_text(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:102: in partition_text
return _partition_text(
.venv/lib/python3.10/site-packages/unstructured/documents/elements.py:605: in wrapper
elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/file_utils/filetype.py:731: in wrapper
elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/file_utils/filetype.py:687: in wrapper
elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/chunking/dispatch.py:74: in wrapper
elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:181: in _partition_text
file_content = _split_by_paragraph(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:361: in _split_by_paragraph
_split_content_to_fit_max(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:393: in _split_content_to_fit_max
sentences = sent_tokenize(content)
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:134: in sent_tokenize
_download_nltk_packages_if_not_present()
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:128: in _download_nltk_packages_if_not_present
download_nltk_packages()
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:86: in download_nltk_packages
urllib.request.urlretrieve(NLTK_DATA_URL, tgz_file_path)
/usr/lib/python3.10/urllib/request.py:241: in urlretrieve
with contextlib.closing(urlopen(url, data)) as fp:
/usr/lib/python3.10/urllib/request.py:216: in urlopen
return opener.open(url, data, timeout)
/usr/lib/python3.10/urllib/request.py:525: in open
response = meth(req, response)
/usr/lib/python3.10/urllib/request.py:634: in http_response
response = self.parent.error(
/usr/lib/python3.10/urllib/request.py:563: in error
return self._call_chain(*args)
/usr/lib/python3.10/urllib/request.py:496: in _call_chain
result = func(*args)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <urllib.request.HTTPDefaultErrorHandler object at 0x7fef3c4219f0>
req = <urllib.request.Request object at 0x7fef37e0b010>
fp = <http.client.HTTPResponse object at 0x7fef37e80220>, code = 403
msg = 'Forbidden', hdrs = <http.client.HTTPMessage object at 0x7fef37e80100>
def http_error_default(self, req, fp, code, msg, hdrs):
> raise HTTPError(req.full_url, code, msg, hdrs, fp)
E urllib.error.HTTPError: HTTP Error 403: Forbidden
/usr/lib/python3.10/urllib/request.py:643: HTTPError
Check failure on line 35 in packages/ragbits-document-search/tests/integration/test_unstructured.py
github-actions / JUnit Test Report
test_unstructured.test_document_processor_processes_text_document_with_unstructured_provider[config1]
urllib.error.HTTPError: HTTP Error 403: Forbidden
Raw output
config = {<DocumentType.TXT: 'txt'>: <ragbits.document_search.ingestion.providers.unstructured.default.UnstructuredDefaultProvider object at 0x7fef53b9f070>}
@pytest.mark.parametrize(
"config",
[
{},
pytest.param({DocumentType.TXT: UnstructuredDefaultProvider()}),
pytest.param(
{DocumentType.TXT: UnstructuredDefaultProvider(use_api=True)},
marks=pytest.mark.skipif(
env_vars_not_set([UNSTRUCTURED_SERVER_URL_ENV, UNSTRUCTURED_API_KEY_ENV]),
reason="Unstructured API environment variables not set",
),
),
],
)
async def test_document_processor_processes_text_document_with_unstructured_provider(config: ProvidersConfig):
document_processor = DocumentProcessorRouter.from_config(config)
document_meta = DocumentMeta.create_text_document_from_literal("Name of Peppa's brother is George.")
> elements = await document_processor.get_provider(document_meta).process(document_meta)
packages/ragbits-document-search/tests/integration/test_unstructured.py:35:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
packages/ragbits-core/src/ragbits/core/audit/__init__.py:106: in wrapper_async
returned = await func(*args, **kwargs) # type: ignore
packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured/default.py:139: in process
elements = partition(
.venv/lib/python3.10/site-packages/unstructured/partition/auto.py:416: in partition
elements = partition_text(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:102: in partition_text
return _partition_text(
.venv/lib/python3.10/site-packages/unstructured/documents/elements.py:605: in wrapper
elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/file_utils/filetype.py:731: in wrapper
elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/file_utils/filetype.py:687: in wrapper
elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/chunking/dispatch.py:74: in wrapper
elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:181: in _partition_text
file_content = _split_by_paragraph(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:361: in _split_by_paragraph
_split_content_to_fit_max(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:393: in _split_content_to_fit_max
sentences = sent_tokenize(content)
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:134: in sent_tokenize
_download_nltk_packages_if_not_present()
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:128: in _download_nltk_packages_if_not_present
download_nltk_packages()
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:86: in download_nltk_packages
urllib.request.urlretrieve(NLTK_DATA_URL, tgz_file_path)
/usr/lib/python3.10/urllib/request.py:241: in urlretrieve
with contextlib.closing(urlopen(url, data)) as fp:
/usr/lib/python3.10/urllib/request.py:216: in urlopen
return opener.open(url, data, timeout)
/usr/lib/python3.10/urllib/request.py:525: in open
response = meth(req, response)
/usr/lib/python3.10/urllib/request.py:634: in http_response
response = self.parent.error(
/usr/lib/python3.10/urllib/request.py:563: in error
return self._call_chain(*args)
/usr/lib/python3.10/urllib/request.py:496: in _call_chain
result = func(*args)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <urllib.request.HTTPDefaultErrorHandler object at 0x7fef3c4219f0>
req = <urllib.request.Request object at 0x7fef37e72e00>
fp = <http.client.HTTPResponse object at 0x7fef37e72980>, code = 403
msg = 'Forbidden', hdrs = <http.client.HTTPMessage object at 0x7fef37e73250>
def http_error_default(self, req, fp, code, msg, hdrs):
> raise HTTPError(req.full_url, code, msg, hdrs, fp)
E urllib.error.HTTPError: HTTP Error 403: Forbidden
/usr/lib/python3.10/urllib/request.py:643: HTTPError
Check failure on line 96 in packages/ragbits-document-search/tests/integration/test_unstructured.py
github-actions / JUnit Test Report
test_unstructured.test_unstructured_provider_document_with_default_partition_kwargs[False]
urllib.error.HTTPError: HTTP Error 403: Forbidden
Raw output
use_api = False
@pytest.mark.parametrize(
"use_api",
[
False,
pytest.param(
True,
marks=pytest.mark.skipif(
env_vars_not_set([UNSTRUCTURED_SERVER_URL_ENV, UNSTRUCTURED_API_KEY_ENV]),
reason="Unstructured API environment variables not set",
),
),
],
)
async def test_unstructured_provider_document_with_default_partition_kwargs(use_api: bool):
document_meta = DocumentMeta.create_text_document_from_literal("Name of Peppa's brother is George.")
unstructured_provider = UnstructuredDefaultProvider(use_api=use_api)
> elements = await unstructured_provider.process(document_meta)
packages/ragbits-document-search/tests/integration/test_unstructured.py:96:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
packages/ragbits-core/src/ragbits/core/audit/__init__.py:106: in wrapper_async
returned = await func(*args, **kwargs) # type: ignore
packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured/default.py:139: in process
elements = partition(
.venv/lib/python3.10/site-packages/unstructured/partition/auto.py:416: in partition
elements = partition_text(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:102: in partition_text
return _partition_text(
.venv/lib/python3.10/site-packages/unstructured/documents/elements.py:605: in wrapper
elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/file_utils/filetype.py:731: in wrapper
elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/file_utils/filetype.py:687: in wrapper
elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/chunking/dispatch.py:74: in wrapper
elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:181: in _partition_text
file_content = _split_by_paragraph(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:361: in _split_by_paragraph
_split_content_to_fit_max(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:393: in _split_content_to_fit_max
sentences = sent_tokenize(content)
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:134: in sent_tokenize
_download_nltk_packages_if_not_present()
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:128: in _download_nltk_packages_if_not_present
download_nltk_packages()
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:86: in download_nltk_packages
urllib.request.urlretrieve(NLTK_DATA_URL, tgz_file_path)
/usr/lib/python3.10/urllib/request.py:241: in urlretrieve
with contextlib.closing(urlopen(url, data)) as fp:
/usr/lib/python3.10/urllib/request.py:216: in urlopen
return opener.open(url, data, timeout)
/usr/lib/python3.10/urllib/request.py:525: in open
response = meth(req, response)
/usr/lib/python3.10/urllib/request.py:634: in http_response
response = self.parent.error(
/usr/lib/python3.10/urllib/request.py:563: in error
return self._call_chain(*args)
/usr/lib/python3.10/urllib/request.py:496: in _call_chain
result = func(*args)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <urllib.request.HTTPDefaultErrorHandler object at 0x7fef3c4219f0>
req = <urllib.request.Request object at 0x7fef379b1450>
fp = <http.client.HTTPResponse object at 0x7fef379b1540>, code = 403
msg = 'Forbidden', hdrs = <http.client.HTTPMessage object at 0x7fef379b12a0>
def http_error_default(self, req, fp, code, msg, hdrs):
> raise HTTPError(req.full_url, code, msg, hdrs, fp)
E urllib.error.HTTPError: HTTP Error 403: Forbidden
/usr/lib/python3.10/urllib/request.py:643: HTTPError
Check failure on line 120 in packages/ragbits-document-search/tests/integration/test_unstructured.py
github-actions / JUnit Test Report
test_unstructured.test_unstructured_provider_document_with_custom_partition_kwargs[False]
urllib.error.HTTPError: HTTP Error 403: Forbidden
Raw output
use_api = False
@pytest.mark.parametrize(
"use_api",
[
False,
pytest.param(
True,
marks=pytest.mark.skipif(
env_vars_not_set([UNSTRUCTURED_SERVER_URL_ENV, UNSTRUCTURED_API_KEY_ENV]),
reason="Unstructured API environment variables not set",
),
),
],
)
async def test_unstructured_provider_document_with_custom_partition_kwargs(use_api: bool):
document_meta = DocumentMeta.create_text_document_from_literal("Name of Peppa's brother is George.")
partition_kwargs = {"languages": ["pl"], "strategy": "fast"}
unstructured_provider = UnstructuredDefaultProvider(use_api=use_api, partition_kwargs=partition_kwargs)
> elements = await unstructured_provider.process(document_meta)
packages/ragbits-document-search/tests/integration/test_unstructured.py:120:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
packages/ragbits-core/src/ragbits/core/audit/__init__.py:106: in wrapper_async
returned = await func(*args, **kwargs) # type: ignore
packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured/default.py:139: in process
elements = partition(
.venv/lib/python3.10/site-packages/unstructured/partition/auto.py:416: in partition
elements = partition_text(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:102: in partition_text
return _partition_text(
.venv/lib/python3.10/site-packages/unstructured/documents/elements.py:605: in wrapper
elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/file_utils/filetype.py:731: in wrapper
elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/file_utils/filetype.py:687: in wrapper
elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/chunking/dispatch.py:74: in wrapper
elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:181: in _partition_text
file_content = _split_by_paragraph(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:361: in _split_by_paragraph
_split_content_to_fit_max(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:393: in _split_content_to_fit_max
sentences = sent_tokenize(content)
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:134: in sent_tokenize
_download_nltk_packages_if_not_present()
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:128: in _download_nltk_packages_if_not_present
download_nltk_packages()
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:86: in download_nltk_packages
urllib.request.urlretrieve(NLTK_DATA_URL, tgz_file_path)
/usr/lib/python3.10/urllib/request.py:241: in urlretrieve
with contextlib.closing(urlopen(url, data)) as fp:
/usr/lib/python3.10/urllib/request.py:216: in urlopen
return opener.open(url, data, timeout)
/usr/lib/python3.10/urllib/request.py:525: in open
response = meth(req, response)
/usr/lib/python3.10/urllib/request.py:634: in http_response
response = self.parent.error(
/usr/lib/python3.10/urllib/request.py:563: in error
return self._call_chain(*args)
/usr/lib/python3.10/urllib/request.py:496: in _call_chain
result = func(*args)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <urllib.request.HTTPDefaultErrorHandler object at 0x7fef3c4219f0>
req = <urllib.request.Request object at 0x7fef37e82aa0>
fp = <http.client.HTTPResponse object at 0x7fef37e82b60>, code = 403
msg = 'Forbidden', hdrs = <http.client.HTTPMessage object at 0x7fef37e83670>
def http_error_default(self, req, fp, code, msg, hdrs):
> raise HTTPError(req.full_url, code, msg, hdrs, fp)
E urllib.error.HTTPError: HTTP Error 403: Forbidden
/usr/lib/python3.10/urllib/request.py:643: HTTPError
Check failure on line 199 in packages/ragbits-document-search/tests/unit/test_document_search.py
github-actions / JUnit Test Report
test_document_search.test_document_search_with_batched
urllib.error.HTTPError: HTTP Error 403: Forbidden
Raw output
async def test_document_search_with_batched():
documents = [
DocumentMeta.create_text_document_from_literal("Name of Peppa's brother is George"),
DocumentMeta.create_text_document_from_literal("Name of Peppa's father is Daddy Pig"),
DocumentMeta.create_text_document_from_literal("Name of Peppa's mother is Mummy Pig"),
DocumentMeta.create_text_document_from_literal("Name of Peppa's friend is Suzy Sheep"),
DocumentMeta.create_text_document_from_literal("Name of Peppa's friend is Danny Dog"),
DocumentMeta.create_text_document_from_literal("Name of Peppa's friend is Pedro Pony"),
DocumentMeta.create_text_document_from_literal("Name of Peppa's friend is Emily Elephant"),
DocumentMeta.create_text_document_from_literal("Name of Peppa's friend is Candy Cat"),
DocumentMeta.create_text_document_from_literal("Name of Peppa's teacher is Madame Gazelle"),
DocumentMeta.create_text_document_from_literal("Name of Peppa's doctor is Dr. Brown Bear"),
DocumentMeta.create_text_document_from_literal("Name of Peppa's cousin is Chloe Pig"),
DocumentMeta.create_text_document_from_literal("Name of Peppa's cousin is Alexander Pig"),
]
embeddings_mock = AsyncMock()
embeddings_mock.embed_text.return_value = [[0.1, 0.1]] * len(documents)
processing_strategy = BatchedAsyncProcessing(batch_size=5)
vectore_store = InMemoryVectorStore()
document_search = DocumentSearch(
embedder=embeddings_mock,
vector_store=vectore_store,
processing_strategy=processing_strategy,
)
> await document_search.ingest(documents)
packages/ragbits-document-search/tests/unit/test_document_search.py:199:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
packages/ragbits-core/src/ragbits/core/audit/__init__.py:106: in wrapper_async
returned = await func(*args, **kwargs) # type: ignore
packages/ragbits-document-search/src/ragbits/document_search/_main.py:157: in ingest
elements = await self.processing_strategy.process_documents(
packages/ragbits-document-search/src/ragbits/document_search/ingestion/processor_strategies/batched.py:59: in process_documents
responses = await asyncio.gather(
packages/ragbits-document-search/src/ragbits/document_search/ingestion/processor_strategies/batched.py:35: in _process_with_semaphore
return await self.process_document(document, processor_router, processor_overwrite)
packages/ragbits-document-search/src/ragbits/document_search/ingestion/processor_strategies/base.py:62: in process_document
return await processor.process(document_meta)
packages/ragbits-core/src/ragbits/core/audit/__init__.py:106: in wrapper_async
returned = await func(*args, **kwargs) # type: ignore
packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured/default.py:139: in process
elements = partition(
.venv/lib/python3.10/site-packages/unstructured/partition/auto.py:416: in partition
elements = partition_text(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:102: in partition_text
return _partition_text(
.venv/lib/python3.10/site-packages/unstructured/documents/elements.py:605: in wrapper
elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/file_utils/filetype.py:731: in wrapper
elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/file_utils/filetype.py:687: in wrapper
elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/chunking/dispatch.py:74: in wrapper
elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:181: in _partition_text
file_content = _split_by_paragraph(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:361: in _split_by_paragraph
_split_content_to_fit_max(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:393: in _split_content_to_fit_max
sentences = sent_tokenize(content)
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:134: in sent_tokenize
_download_nltk_packages_if_not_present()
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:128: in _download_nltk_packages_if_not_present
download_nltk_packages()
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:86: in download_nltk_packages
urllib.request.urlretrieve(NLTK_DATA_URL, tgz_file_path)
/usr/lib/python3.10/urllib/request.py:241: in urlretrieve
with contextlib.closing(urlopen(url, data)) as fp:
/usr/lib/python3.10/urllib/request.py:216: in urlopen
return opener.open(url, data, timeout)
/usr/lib/python3.10/urllib/request.py:525: in open
response = meth(req, response)
/usr/lib/python3.10/urllib/request.py:634: in http_response
response = self.parent.error(
/usr/lib/python3.10/urllib/request.py:563: in error
return self._call_chain(*args)
/usr/lib/python3.10/urllib/request.py:496: in _call_chain
result = func(*args)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <urllib.request.HTTPDefaultErrorHandler object at 0x7fef3c4219f0>
req = <urllib.request.Request object at 0x7fef37a99420>
fp = <http.client.HTTPResponse object at 0x7fef37a9ac20>, code = 403
msg = 'Forbidden', hdrs = <http.client.HTTPMessage object at 0x7fef379c0340>
def http_error_default(self, req, fp, code, msg, hdrs):
> raise HTTPError(req.full_url, code, msg, hdrs, fp)
E urllib.error.HTTPError: HTTP Error 403: Forbidden
/usr/lib/python3.10/urllib/request.py:643: HTTPError