diff --git a/integrations/pinecone/examples/example.py b/integrations/pinecone/examples/example.py
new file mode 100644
index 000000000..b2a534452
--- /dev/null
+++ b/integrations/pinecone/examples/example.py
@@ -0,0 +1,53 @@
+# Install the Pinecone integration, Haystack will come as a dependency
+# Also install some optional dependencies needed for Markdown conversion and text embedding
+# pip install -U pinecone-haystack markdown-it-py mdit_plain "sentence-transformers>=2.2.0"
+
+# Download some markdown files to index
+# git clone https://github.com/anakin87/neural-search-pills
+
+
+# Create the indexing Pipeline and index some documents
+
+import glob
+
+from haystack import Pipeline
+from haystack.components.converters import MarkdownToDocument
+from haystack.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder
+from haystack.components.preprocessors import DocumentSplitter
+from haystack.components.writers import DocumentWriter
+from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever
+from haystack_integrations.document_stores.pinecone import PineconeDocumentStore
+
+file_paths = glob.glob("neural-search-pills/pills/*.md")
+
+# NOTE(review): dimension=768 is presumably the embedding size of the default embedder model - confirm
+document_store = PineconeDocumentStore(
+    api_key="YOUR-PINECONE-API-KEY", environment="gcp-starter", index="default", namespace="default", dimension=768
+)
+
+# Indexing flow: Markdown files -> Documents -> two-sentence chunks -> embeddings -> document store
+indexing = Pipeline()
+indexing.add_component("converter", MarkdownToDocument())
+indexing.add_component("splitter", DocumentSplitter(split_by="sentence", split_length=2))
+indexing.add_component("embedder", SentenceTransformersDocumentEmbedder())
+indexing.add_component("writer", DocumentWriter(document_store))
+indexing.connect("converter", "splitter")
+indexing.connect("splitter", "embedder")
+indexing.connect("embedder", "writer")
+
+indexing.run({"converter": {"sources": file_paths}})
+
+
+# Create the querying Pipeline and try a query
+
+# Querying flow: query text -> embedding -> retriever (top_k=3) over the same document store
+querying = Pipeline()
+querying.add_component("embedder", SentenceTransformersTextEmbedder())
+querying.add_component("retriever", PineconeEmbeddingRetriever(document_store=document_store, top_k=3))
+querying.connect("embedder", "retriever")
+
+results = querying.run({"embedder": {"text": "What is Question Answering?"}})
+
+for doc in results["retriever"]["documents"]:
+    print(doc)
+    print("-" * 10)
diff --git a/integrations/pinecone/examples/pinecone_documentstore_example.ipynb b/integrations/pinecone/examples/pinecone_documentstore_example.ipynb deleted file mode 100644 index 70849836b..000000000 --- a/integrations/pinecone/examples/pinecone_documentstore_example.ipynb +++ /dev/null @@ -1,1298 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - " \"Open\n", - "" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "TFIFYm41SS8C" - }, - "outputs": [], - "source": [ - "# Install the Pinecone integration, Haystack will come as a dependency\n", - "# Install also some optional dependencies needed for Markdown conversion and text embedding\n", - "!pip install -U pinecone-haystack markdown-it-py mdit_plain \"sentence-transformers>=2.2.0\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "4sTKYa3qbRAi", - "outputId": "7e8dcbb9-b330-4dab-8516-0caf0520b315" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Cloning into 'neural-search-pills'...\n", - "remote: Enumerating objects: 190, done.\u001b[K\n", - "remote: Counting objects: 100% (190/190), done.\u001b[K\n", - "remote: Compressing objects: 100% (136/136), done.\u001b[K\n", - "remote: Total 190 (delta 97), reused 130 (delta 51), pack-reused 0\u001b[K\n", - "Receiving objects: 100% (190/190), 1.38 MiB | 21.77 MiB/s, done.\n", - "Resolving deltas: 100% (97/97), done.\n" - ] - } - ], - "source": [ - "# Download some markdown files to index\n", - "!git clone https://github.com/anakin87/neural-search-pills"
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 153, - "referenced_widgets": [ - "c46e6d0699ff4fbcb8012b7076f5017d", - "5d94db6d00fb4d1abfbaa02f79bdeb78", - "98fce152d7cd45408bd5151574908310", - "3589c7ca39324b359c2378b944974669", - "b7d283c62ebc4199bc2cd6e3bfcb94bc", - "e706a6825bfe4735a0b83eb0dba20286", - "653f00da627740c39b740cd1bdb9e5ff", - "4fbcba61b6724cfc9c174a4b5c4fdf01", - "ed89a5c30ca24d45af85a8a5a61fae04", - "1a423d196704471481ed04a31e22e428", - "c641f6f19bd24af7874805097002dace", - "da90aa6e98de4e80a38a205d40695dcc", - "522ddaca7d26497d8e84a9be785f2222", - "a4c7565d2e5a4ac0aaa9bef88d8cc25f", - "efbd19a1674f4a6b8322ecf1a2e11591", - "fef14c506fef4ba5bc0c65e541e918b6", - "e2fe8960681b4c918eaedc6952e2534f", - "4d37fc2fea964cd590422097ab9a44bd", - "b2f99035deec45b187fe85813fb5bef4", - "e2238770d45047e998719cfb0ced7f2b", - "444ae4a1444b4f74ab45ca56aa8da8dc", - "9f22449bf9ba4ca0be6ad93656fed1d7" - ] - }, - "id": "CacG5dsTTT8B", - "outputId": "afac15af-7b51-4d7c-a301-004cbf333e9a" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Converting markdown files to Documents: 100%|██████████| 14/14 [00:00<00:00, 163.72it/s]\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "c46e6d0699ff4fbcb8012b7076f5017d", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Batches: 0%| | 0/5 [00:00 Dict[str, Any]: ) @classmethod - def from_dict(cls, data: Dict[str, Any]) -> "PineconeDenseRetriever": + def from_dict(cls, data: Dict[str, Any]) -> "PineconeEmbeddingRetriever": data["init_parameters"]["document_store"] = default_from_dict( PineconeDocumentStore, data["init_parameters"]["document_store"] ) diff --git a/integrations/pinecone/tests/test_dense_retriever.py b/integrations/pinecone/tests/test_emebedding_retriever.py similarity index 89% rename from 
integrations/pinecone/tests/test_dense_retriever.py rename to integrations/pinecone/tests/test_emebedding_retriever.py index e0f6dc375..d2d3c8546 100644 --- a/integrations/pinecone/tests/test_dense_retriever.py +++ b/integrations/pinecone/tests/test_emebedding_retriever.py @@ -5,13 +5,13 @@ from haystack.dataclasses import Document -from haystack_integrations.components.retrievers.pinecone import PineconeDenseRetriever +from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever from haystack_integrations.document_stores.pinecone import PineconeDocumentStore def test_init_default(): mock_store = Mock(spec=PineconeDocumentStore) - retriever = PineconeDenseRetriever(document_store=mock_store) + retriever = PineconeEmbeddingRetriever(document_store=mock_store) assert retriever.document_store == mock_store assert retriever.filters == {} assert retriever.top_k == 10 @@ -28,10 +28,10 @@ def test_to_dict(mock_pinecone): batch_size=50, dimension=512, ) - retriever = PineconeDenseRetriever(document_store=document_store) + retriever = PineconeEmbeddingRetriever(document_store=document_store) res = retriever.to_dict() assert res == { - "type": "haystack_integrations.components.retrievers.pinecone.dense_retriever.PineconeDenseRetriever", + "type": "haystack_integrations.components.retrievers.pinecone.embedding_retriever.PineconeEmbeddingRetriever", "init_parameters": { "document_store": { "init_parameters": { @@ -52,7 +52,7 @@ def test_to_dict(mock_pinecone): @patch("haystack_integrations.document_stores.pinecone.document_store.pinecone") def test_from_dict(mock_pinecone, monkeypatch): data = { - "type": "haystack_integrations.components.retrievers.pinecone.dense_retriever.PineconeDenseRetriever", + "type": "haystack_integrations.components.retrievers.pinecone.embedding_retriever.PineconeEmbeddingRetriever", "init_parameters": { "document_store": { "init_parameters": { @@ -71,7 +71,7 @@ def test_from_dict(mock_pinecone, monkeypatch): 
mock_pinecone.Index.return_value.describe_index_stats.return_value = {"dimension": 512} monkeypatch.setenv("PINECONE_API_KEY", "test-key") - retriever = PineconeDenseRetriever.from_dict(data) + retriever = PineconeEmbeddingRetriever.from_dict(data) document_store = retriever.document_store assert document_store.environment == "gcp-starter" @@ -87,7 +87,7 @@ def test_from_dict(mock_pinecone, monkeypatch): def test_run(): mock_store = Mock(spec=PineconeDocumentStore) mock_store._embedding_retrieval.return_value = [Document(content="Test doc", embedding=[0.1, 0.2])] - retriever = PineconeDenseRetriever(document_store=mock_store) + retriever = PineconeEmbeddingRetriever(document_store=mock_store) res = retriever.run(query_embedding=[0.5, 0.7]) mock_store._embedding_retrieval.assert_called_once_with( query_embedding=[0.5, 0.7],