From c14a054f75b5fb38e70ab9ce0efa5c791f5b1660 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Mon, 12 Feb 2024 16:38:06 +0100 Subject: [PATCH 1/3] rename retriever --- integrations/pinecone/examples/example.py | 50 + .../pinecone_documentstore_example.ipynb | 1298 ----------------- integrations/pinecone/pyproject.toml | 2 + .../retrievers/pinecone/__init__.py | 4 +- ...se_retriever.py => embedding_retriever.py} | 6 +- .../pinecone/tests/test_document_store.py | 6 +- ...riever.py => test_emebedding_retriever.py} | 14 +- integrations/pinecone/tests/test_filters.py | 33 +- 8 files changed, 90 insertions(+), 1323 deletions(-) create mode 100644 integrations/pinecone/examples/example.py delete mode 100644 integrations/pinecone/examples/pinecone_documentstore_example.ipynb rename integrations/pinecone/src/haystack_integrations/components/retrievers/pinecone/{dense_retriever.py => embedding_retriever.py} (93%) rename integrations/pinecone/tests/{test_dense_retriever.py => test_emebedding_retriever.py} (89%) diff --git a/integrations/pinecone/examples/example.py b/integrations/pinecone/examples/example.py new file mode 100644 index 000000000..b2a534452 --- /dev/null +++ b/integrations/pinecone/examples/example.py @@ -0,0 +1,50 @@ +# Install the Pinecone integration, Haystack will come as a dependency +# Install also some optional dependencies needed for Markdown conversion and text embedding +# pip install -U pinecone-haystack markdown-it-py mdit_plain "sentence-transformers>=2.2.0" + +# Download some markdown files to index +# git clone https://github.com/anakin87/neural-search-pills + + +# Create the indexing Pipeline and index some documents + +import glob + +from haystack import Pipeline +from haystack.components.converters import MarkdownToDocument +from haystack.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder +from haystack.components.preprocessors import DocumentSplitter +from haystack.components.writers import DocumentWriter +from pinecone_haystack import PineconeDocumentStore +from pinecone_haystack.dense_retriever import PineconeEmbeddingRetriever + +file_paths = glob.glob("neural-search-pills/pills/*.md") + +document_store = PineconeDocumentStore( + api_key="YOUR-PINECONE-API-KEY", environment="gcp-starter", index="default", namespace="default", dimension=768 +) + +indexing = Pipeline() +indexing.add_component("converter", MarkdownToDocument()) +indexing.add_component("splitter", DocumentSplitter(split_by="sentence", split_length=2)) +indexing.add_component("embedder", SentenceTransformersDocumentEmbedder()) +indexing.add_component("writer", DocumentWriter(document_store)) +indexing.connect("converter", "splitter") +indexing.connect("splitter", "embedder") +indexing.connect("embedder", "writer") + +indexing.run({"converter": {"sources": file_paths}}) + + +# Create the querying Pipeline and try a query + +querying = Pipeline() +querying.add_component("embedder", SentenceTransformersTextEmbedder()) +querying.add_component("retriever", PineconeEmbeddingRetriever(document_store=document_store, top_k=3)) +querying.connect("embedder", "retriever") + +results = querying.run({"embedder": {"text": "What is Question Answering?"}}) + +for doc in results["retriever"]["documents"]: + print(doc) + print("-" * 10) diff --git a/integrations/pinecone/examples/pinecone_documentstore_example.ipynb b/integrations/pinecone/examples/pinecone_documentstore_example.ipynb deleted file mode 100644 index 70849836b..000000000 --- a/integrations/pinecone/examples/pinecone_documentstore_example.ipynb +++ /dev/null @@ -1,1298 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - " \"Open\n", - "" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "TFIFYm41SS8C" - }, - "outputs": [], - "source": [ - "# Install the Pinecone integration, Haystack will come as a dependency\n", - "# Install also some optional dependencies needed for Markdown conversion and text embedding\n", - "!pip install -U pinecone-haystack markdown-it-py mdit_plain \"sentence-transformers>=2.2.0\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "4sTKYa3qbRAi", - "outputId": "7e8dcbb9-b330-4dab-8516-0caf0520b315" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Cloning into 'neural-search-pills'...\n", - "remote: Enumerating objects: 190, done.\u001b[K\n", - "remote: Counting objects: 100% (190/190), done.\u001b[K\n", - "remote: Compressing objects: 100% (136/136), done.\u001b[K\n", - "remote: Total 190 (delta 97), reused 130 (delta 51), pack-reused 0\u001b[K\n", - "Receiving objects: 100% (190/190), 1.38 MiB | 21.77 MiB/s, done.\n", - "Resolving deltas: 100% (97/97), done.\n" - ] - } - ], - "source": [ - "# Download some markdown files to index\n", - "!git clone https://github.com/anakin87/neural-search-pills" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 153, - "referenced_widgets": [ - "c46e6d0699ff4fbcb8012b7076f5017d", - "5d94db6d00fb4d1abfbaa02f79bdeb78", - "98fce152d7cd45408bd5151574908310", - "3589c7ca39324b359c2378b944974669", - "b7d283c62ebc4199bc2cd6e3bfcb94bc", - "e706a6825bfe4735a0b83eb0dba20286", - "653f00da627740c39b740cd1bdb9e5ff", - "4fbcba61b6724cfc9c174a4b5c4fdf01", - "ed89a5c30ca24d45af85a8a5a61fae04", - "1a423d196704471481ed04a31e22e428", - "c641f6f19bd24af7874805097002dace", - "da90aa6e98de4e80a38a205d40695dcc", - "522ddaca7d26497d8e84a9be785f2222", - "a4c7565d2e5a4ac0aaa9bef88d8cc25f", - "efbd19a1674f4a6b8322ecf1a2e11591", - "fef14c506fef4ba5bc0c65e541e918b6", - "e2fe8960681b4c918eaedc6952e2534f", - "4d37fc2fea964cd590422097ab9a44bd", - "b2f99035deec45b187fe85813fb5bef4", - "e2238770d45047e998719cfb0ced7f2b", - "444ae4a1444b4f74ab45ca56aa8da8dc", - "9f22449bf9ba4ca0be6ad93656fed1d7" - ] - }, - "id": "CacG5dsTTT8B", - "outputId": "afac15af-7b51-4d7c-a301-004cbf333e9a" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Converting markdown files to Documents: 100%|██████████| 14/14 [00:00<00:00, 163.72it/s]\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "c46e6d0699ff4fbcb8012b7076f5017d", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Batches: 0%| | 0/5 [00:00 Dict[str, Any]: ) @classmethod - def from_dict(cls, data: Dict[str, Any]) -> "PineconeDenseRetriever": + def from_dict(cls, data: Dict[str, Any]) -> "PineconeEmbeddingRetriever": data["init_parameters"]["document_store"] = default_from_dict( PineconeDocumentStore, data["init_parameters"]["document_store"] ) diff --git a/integrations/pinecone/tests/test_document_store.py b/integrations/pinecone/tests/test_document_store.py index cd1bb0db3..a856cde86 100644 --- a/integrations/pinecone/tests/test_document_store.py +++ b/integrations/pinecone/tests/test_document_store.py @@ -80,10 +80,12 @@ def test_write_documents(self, document_store: PineconeDocumentStore): assert document_store.write_documents(docs) == 1 @pytest.mark.skip(reason="Pinecone only supports UPSERT operations") - def test_write_documents_duplicate_fail(self, document_store: PineconeDocumentStore): ... + def test_write_documents_duplicate_fail(self, document_store: PineconeDocumentStore): + ... @pytest.mark.skip(reason="Pinecone only supports UPSERT operations") - def test_write_documents_duplicate_skip(self, document_store: PineconeDocumentStore): ... + def test_write_documents_duplicate_skip(self, document_store: PineconeDocumentStore): + ... def test_init_fails_wo_api_key(self, monkeypatch): api_key = None diff --git a/integrations/pinecone/tests/test_dense_retriever.py b/integrations/pinecone/tests/test_emebedding_retriever.py similarity index 89% rename from integrations/pinecone/tests/test_dense_retriever.py rename to integrations/pinecone/tests/test_emebedding_retriever.py index e0f6dc375..ccb182679 100644 --- a/integrations/pinecone/tests/test_dense_retriever.py +++ b/integrations/pinecone/tests/test_emebedding_retriever.py @@ -5,13 +5,13 @@ from haystack.dataclasses import Document -from haystack_integrations.components.retrievers.pinecone import PineconeDenseRetriever +from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever from haystack_integrations.document_stores.pinecone import PineconeDocumentStore def test_init_default(): mock_store = Mock(spec=PineconeDocumentStore) - retriever = PineconeDenseRetriever(document_store=mock_store) + retriever = PineconeEmbeddingRetriever(document_store=mock_store) assert retriever.document_store == mock_store assert retriever.filters == {} assert retriever.top_k == 10 @@ -28,10 +28,10 @@ def test_to_dict(mock_pinecone): batch_size=50, dimension=512, ) - retriever = PineconeDenseRetriever(document_store=document_store) + retriever = PineconeEmbeddingRetriever(document_store=document_store) res = retriever.to_dict() assert res == { - "type": "haystack_integrations.components.retrievers.pinecone.dense_retriever.PineconeDenseRetriever", + "type": "haystack_integrations.components.retrievers.pinecone.dense_retriever.PineconeEmbeddingRetriever", "init_parameters": { "document_store": { "init_parameters": { @@ -52,7 +52,7 @@ def test_to_dict(mock_pinecone): @patch("haystack_integrations.document_stores.pinecone.document_store.pinecone") def test_from_dict(mock_pinecone, monkeypatch): data = { - "type": "haystack_integrations.components.retrievers.pinecone.dense_retriever.PineconeDenseRetriever", + "type": "haystack_integrations.components.retrievers.pinecone.dense_retriever.PineconeEmbeddingRetriever", "init_parameters": { "document_store": { "init_parameters": { @@ -71,7 +71,7 @@ def test_from_dict(mock_pinecone, monkeypatch): mock_pinecone.Index.return_value.describe_index_stats.return_value = {"dimension": 512} monkeypatch.setenv("PINECONE_API_KEY", "test-key") - retriever = PineconeDenseRetriever.from_dict(data) + retriever = PineconeEmbeddingRetriever.from_dict(data) document_store = retriever.document_store assert document_store.environment == "gcp-starter" @@ -87,7 +87,7 @@ def test_from_dict(mock_pinecone, monkeypatch): def test_run(): mock_store = Mock(spec=PineconeDocumentStore) mock_store._embedding_retrieval.return_value = [Document(content="Test doc", embedding=[0.1, 0.2])] - retriever = PineconeDenseRetriever(document_store=mock_store) + retriever = PineconeEmbeddingRetriever(document_store=mock_store) res = retriever.run(query_embedding=[0.5, 0.7]) mock_store._embedding_retrieval.assert_called_once_with( query_embedding=[0.5, 0.7], diff --git a/integrations/pinecone/tests/test_filters.py b/integrations/pinecone/tests/test_filters.py index 05796cf20..a38482a26 100644 --- a/integrations/pinecone/tests/test_filters.py +++ b/integrations/pinecone/tests/test_filters.py @@ -38,34 +38,45 @@ def assert_documents_are_equal(self, received: List[Document], expected: List[Do assert received_doc.embedding == pytest.approx(expected_doc.embedding) @pytest.mark.skip(reason="Pinecone does not support comparison with null values") - def test_comparison_equal_with_none(self, document_store, filterable_docs): ... + def test_comparison_equal_with_none(self, document_store, filterable_docs): + ... @pytest.mark.skip(reason="Pinecone does not support comparison with null values") - def test_comparison_not_equal_with_none(self, document_store, filterable_docs): ... + def test_comparison_not_equal_with_none(self, document_store, filterable_docs): + ... @pytest.mark.skip(reason="Pinecone does not support comparison with dates") - def test_comparison_greater_than_with_iso_date(self, document_store, filterable_docs): ... + def test_comparison_greater_than_with_iso_date(self, document_store, filterable_docs): + ... @pytest.mark.skip(reason="Pinecone does not support comparison with null values") - def test_comparison_greater_than_with_none(self, document_store, filterable_docs): ... + def test_comparison_greater_than_with_none(self, document_store, filterable_docs): + ... @pytest.mark.skip(reason="Pinecone does not support comparison with dates") - def test_comparison_greater_than_equal_with_iso_date(self, document_store, filterable_docs): ... + def test_comparison_greater_than_equal_with_iso_date(self, document_store, filterable_docs): + ... @pytest.mark.skip(reason="Pinecone does not support comparison with null values") - def test_comparison_greater_than_equal_with_none(self, document_store, filterable_docs): ... + def test_comparison_greater_than_equal_with_none(self, document_store, filterable_docs): + ... @pytest.mark.skip(reason="Pinecone does not support comparison with dates") - def test_comparison_less_than_with_iso_date(self, document_store, filterable_docs): ... + def test_comparison_less_than_with_iso_date(self, document_store, filterable_docs): + ... @pytest.mark.skip(reason="Pinecone does not support comparison with null values") - def test_comparison_less_than_with_none(self, document_store, filterable_docs): ... + def test_comparison_less_than_with_none(self, document_store, filterable_docs): + ... @pytest.mark.skip(reason="Pinecone does not support comparison with dates") - def test_comparison_less_than_equal_with_iso_date(self, document_store, filterable_docs): ... + def test_comparison_less_than_equal_with_iso_date(self, document_store, filterable_docs): + ... @pytest.mark.skip(reason="Pinecone does not support comparison with null values") - def test_comparison_less_than_equal_with_none(self, document_store, filterable_docs): ... + def test_comparison_less_than_equal_with_none(self, document_store, filterable_docs): + ... @pytest.mark.skip(reason="Pinecone does not support the 'not' operator") - def test_not_operator(self, document_store, filterable_docs): ... + def test_not_operator(self, document_store, filterable_docs): + ... From a7943e2686a237dd44bb0f8d78e99a5e71fc1f69 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Mon, 12 Feb 2024 16:53:47 +0100 Subject: [PATCH 2/3] fix linting and test --- .../pinecone/tests/test_document_store.py | 6 ++-- .../tests/test_emebedding_retriever.py | 4 +-- integrations/pinecone/tests/test_filters.py | 33 +++++++------------ 3 files changed, 15 insertions(+), 28 deletions(-) diff --git a/integrations/pinecone/tests/test_document_store.py b/integrations/pinecone/tests/test_document_store.py index a856cde86..cd1bb0db3 100644 --- a/integrations/pinecone/tests/test_document_store.py +++ b/integrations/pinecone/tests/test_document_store.py @@ -80,12 +80,10 @@ def test_write_documents(self, document_store: PineconeDocumentStore): assert document_store.write_documents(docs) == 1 @pytest.mark.skip(reason="Pinecone only supports UPSERT operations") - def test_write_documents_duplicate_fail(self, document_store: PineconeDocumentStore): - ... + def test_write_documents_duplicate_fail(self, document_store: PineconeDocumentStore): ... @pytest.mark.skip(reason="Pinecone only supports UPSERT operations") - def test_write_documents_duplicate_skip(self, document_store: PineconeDocumentStore): - ... + def test_write_documents_duplicate_skip(self, document_store: PineconeDocumentStore): ... def test_init_fails_wo_api_key(self, monkeypatch): api_key = None diff --git a/integrations/pinecone/tests/test_emebedding_retriever.py b/integrations/pinecone/tests/test_emebedding_retriever.py index ccb182679..d2d3c8546 100644 --- a/integrations/pinecone/tests/test_emebedding_retriever.py +++ b/integrations/pinecone/tests/test_emebedding_retriever.py @@ -31,7 +31,7 @@ def test_to_dict(mock_pinecone): retriever = PineconeEmbeddingRetriever(document_store=document_store) res = retriever.to_dict() assert res == { - "type": "haystack_integrations.components.retrievers.pinecone.dense_retriever.PineconeEmbeddingRetriever", + "type": "haystack_integrations.components.retrievers.pinecone.embedding_retriever.PineconeEmbeddingRetriever", "init_parameters": { "document_store": { "init_parameters": { @@ -52,7 +52,7 @@ def test_to_dict(mock_pinecone): @patch("haystack_integrations.document_stores.pinecone.document_store.pinecone") def test_from_dict(mock_pinecone, monkeypatch): data = { - "type": "haystack_integrations.components.retrievers.pinecone.dense_retriever.PineconeEmbeddingRetriever", + "type": "haystack_integrations.components.retrievers.pinecone.embedding_retriever.PineconeEmbeddingRetriever", "init_parameters": { "document_store": { "init_parameters": { diff --git a/integrations/pinecone/tests/test_filters.py b/integrations/pinecone/tests/test_filters.py index a38482a26..05796cf20 100644 --- a/integrations/pinecone/tests/test_filters.py +++ b/integrations/pinecone/tests/test_filters.py @@ -38,45 +38,34 @@ def assert_documents_are_equal(self, received: List[Document], expected: List[Do assert received_doc.embedding == pytest.approx(expected_doc.embedding) @pytest.mark.skip(reason="Pinecone does not support comparison with null values") - def test_comparison_equal_with_none(self, document_store, filterable_docs): - ... + def test_comparison_equal_with_none(self, document_store, filterable_docs): ... @pytest.mark.skip(reason="Pinecone does not support comparison with null values") - def test_comparison_not_equal_with_none(self, document_store, filterable_docs): - ... + def test_comparison_not_equal_with_none(self, document_store, filterable_docs): ... @pytest.mark.skip(reason="Pinecone does not support comparison with dates") - def test_comparison_greater_than_with_iso_date(self, document_store, filterable_docs): - ... + def test_comparison_greater_than_with_iso_date(self, document_store, filterable_docs): ... @pytest.mark.skip(reason="Pinecone does not support comparison with null values") - def test_comparison_greater_than_with_none(self, document_store, filterable_docs): - ... + def test_comparison_greater_than_with_none(self, document_store, filterable_docs): ... @pytest.mark.skip(reason="Pinecone does not support comparison with dates") - def test_comparison_greater_than_equal_with_iso_date(self, document_store, filterable_docs): - ... + def test_comparison_greater_than_equal_with_iso_date(self, document_store, filterable_docs): ... @pytest.mark.skip(reason="Pinecone does not support comparison with null values") - def test_comparison_greater_than_equal_with_none(self, document_store, filterable_docs): - ... + def test_comparison_greater_than_equal_with_none(self, document_store, filterable_docs): ... @pytest.mark.skip(reason="Pinecone does not support comparison with dates") - def test_comparison_less_than_with_iso_date(self, document_store, filterable_docs): - ... + def test_comparison_less_than_with_iso_date(self, document_store, filterable_docs): ... @pytest.mark.skip(reason="Pinecone does not support comparison with null values") - def test_comparison_less_than_with_none(self, document_store, filterable_docs): - ... + def test_comparison_less_than_with_none(self, document_store, filterable_docs): ... @pytest.mark.skip(reason="Pinecone does not support comparison with dates") - def test_comparison_less_than_equal_with_iso_date(self, document_store, filterable_docs): - ... + def test_comparison_less_than_equal_with_iso_date(self, document_store, filterable_docs): ... @pytest.mark.skip(reason="Pinecone does not support comparison with null values") - def test_comparison_less_than_equal_with_none(self, document_store, filterable_docs): - ... + def test_comparison_less_than_equal_with_none(self, document_store, filterable_docs): ... @pytest.mark.skip(reason="Pinecone does not support the 'not' operator") - def test_not_operator(self, document_store, filterable_docs): - ... + def test_not_operator(self, document_store, filterable_docs): ... From 56f7737b3f0721c3d0c4724dd3e74e2ae8269ab7 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Mon, 12 Feb 2024 17:04:42 +0100 Subject: [PATCH 3/3] fix pydoc config --- integrations/pinecone/pydoc/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/pinecone/pydoc/config.yml b/integrations/pinecone/pydoc/config.yml index f2d6b338b..cef514987 100644 --- a/integrations/pinecone/pydoc/config.yml +++ b/integrations/pinecone/pydoc/config.yml @@ -3,7 +3,7 @@ loaders: search_path: [../src] modules: [ - "haystack_integrations.components.retrievers.pinecone.dense_retriever", + "haystack_integrations.components.retrievers.pinecone.embedding_retriever", "haystack_integrations.document_stores.pinecone.document_store", "haystack_integrations.document_stores.pinecone.errors", "haystack_integrations.document_stores.pinecone.filters",