From dd2613b82804d5a8ca8b3c50ae8d763f39680b40 Mon Sep 17 00:00:00 2001
From: Ashwin Mathur <97467100+awinml@users.noreply.github.com>
Date: Wed, 20 Mar 2024 02:56:48 +0530
Subject: [PATCH] Update example and description (#210)

---
 integrations/voyage.md | 50 ++++++++++++++++++++++--------------------
 1 file changed, 26 insertions(+), 24 deletions(-)

diff --git a/integrations/voyage.md b/integrations/voyage.md
index 6ee1c13d..c8ec3cee 100644
--- a/integrations/voyage.md
+++ b/integrations/voyage.md
@@ -3,11 +3,11 @@ layout: integration
 name: Voyage AI
 description: A component for computing embeddings using Voyage AI embedding models - built for Haystack 2.0.
 authors:
-  - name: Ashwin Mathur
-    socials:
-      github: awinml
-      twitter: awinml
-      linkedin: ashwin-mathur-ds
+    - name: Ashwin Mathur
+      socials:
+        github: awinml
+        twitter: awinml
+        linkedin: ashwin-mathur-ds
 pypi: https://pypi.org/project/voyage-embedders-haystack/
 repo: https://github.com/awinml/voyage-embedders-haystack/tree/main
 type: Model Provider
@@ -17,8 +17,9 @@ version: Haystack 2.0
 toc: true
 ---
 
-[![PyPI](https://img.shields.io/pypi/v/voyage-embedders-haystack)](https://pypi.org/project/voyage-embedders-haystack/)
-![PyPI - Python Version](https://img.shields.io/pypi/pyversions/voyage-embedders-haystack?logo=python&logoColor=gold)
+[![PyPI](https://img.shields.io/pypi/v/voyage-embedders-haystack)](https://pypi.org/project/voyage-embedders-haystack/)
+![PyPI - Python Version](https://img.shields.io/pypi/pyversions/voyage-embedders-haystack?logo=python&logoColor=gold)
+
 
 ### **Table of Contents**
 
 - [Installation](#installation)
@@ -27,8 +28,7 @@ toc: true
 
 Custom component for [Haystack](https://github.com/deepset-ai/haystack) (2.x) for creating embeddings using the [VoyageAI Embedding Models](https://voyageai.com/).
 
-Voyage’s embedding models, `voyage-01` and `voyage-lite-01`, are state-of-the-art in retrieval accuracy. These models outperform top performing embedding models like `BAAI-bge` and `OpenAI text-embedding-ada-002` on the [MTEB Benchmark](https://github.com/embeddings-benchmark/mteb).
-
+Voyage’s embedding models, `voyage-2` and `voyage-code-2`, are state-of-the-art in retrieval accuracy. These models outperform top-performing embedding models like `intfloat/e5-mistral-7b-instruct` and `OpenAI/text-embedding-3-large` on the [MTEB Benchmark](https://github.com/embeddings-benchmark/mteb). `voyage-2` is currently ranked second on the [MTEB Leaderboard](https://huggingface.co/spaces/mteb/leaderboard).
 
 ## Installation
 
@@ -40,17 +40,18 @@
 pip install voyage-embedders-haystack
 ```
 
 You can use Voyage Embedding models with two components: [VoyageTextEmbedder](https://github.com/awinml/voyage-embedders-haystack/blob/main/src/voyage_embedders/voyage_text_embedder.py) and [VoyageDocumentEmbedder](https://github.com/awinml/voyage-embedders-haystack/blob/main/src/voyage_embedders/voyage_document_embedder.py).
 
-To create semantic embeddings for documents, use `VoyageDocumentEmbedder` in your indexing pipeline. For generating embeddings for queries, use `VoyageTextEmbedder`. Once you've selected the suitable component for your specific use case, initialize the component with the model name and Voyage AI API key. You can also
-set the environment variable "VOYAGE_API_KEY" instead of passing the api key as an argument.
+To create semantic embeddings for documents, use `VoyageDocumentEmbedder` in your indexing pipeline. For generating embeddings for queries, use `VoyageTextEmbedder`.
+
+Once you've selected the suitable component for your specific use case, initialize the component with the model name and your VoyageAI API key. You can also
+set the environment variable `VOYAGE_API_KEY` instead of passing the API key as an argument.
 
 Information about the supported models, can be found on the [Embeddings Documentation.](https://docs.voyageai.com/embeddings/)
 
 To get an API key, please see the [Voyage AI website.](https://www.voyageai.com/)
 
-
 ## Example
 
-Below is the example Semantic Search pipeline that uses the [Simple Wikipedia](https://huggingface.co/datasets/pszemraj/simple_wikipedia) Dataset from HuggingFace. You can find more examples in the [`examples`](https://github.com/awinml/voyage-embedders-haystack/tree/main/examples) folder.
+Below is an example Semantic Search pipeline that uses the [Simple Wikipedia](https://huggingface.co/datasets/pszemraj/simple_wikipedia) Dataset from HuggingFace. You can find more examples in the [`examples`](https://github.com/awinml/voyage-embedders-haystack/tree/main/examples) folder.
 
 Load the dataset:
 
@@ -64,8 +65,7 @@ from haystack.dataclasses import Document
 from haystack.document_stores.in_memory import InMemoryDocumentStore
 
 # Import Voyage Embedders
-from voyage_embedders.voyage_document_embedder import VoyageDocumentEmbedder
-from voyage_embedders.voyage_text_embedder import VoyageTextEmbedder
+from haystack_integrations.components.embedders.voyage_embedders import VoyageDocumentEmbedder, VoyageTextEmbedder
 
 # Load first 100 rows of the Simple Wikipedia Dataset from HuggingFace
 dataset = load_dataset("pszemraj/simple_wikipedia", split="validation[:100]")
 
@@ -86,17 +86,19 @@ Index the documents to the `InMemoryDocumentStore` using the `VoyageDocumentEmbe
 ```python
 doc_store = InMemoryDocumentStore(embedding_similarity_function="cosine")
+retriever = InMemoryEmbeddingRetriever(document_store=doc_store)
+doc_writer = DocumentWriter(document_store=doc_store)
+
 doc_embedder = VoyageDocumentEmbedder(
-    model_name="voyage-01",
+    model="voyage-2",
     input_type="document",
-    batch_size=8,
-    api_key="VOYAGE_API_KEY",
 )
+text_embedder = VoyageTextEmbedder(model="voyage-2", input_type="query")
 
 # Indexing Pipeline
 indexing_pipeline = Pipeline()
 indexing_pipeline.add_component(instance=doc_embedder, name="DocEmbedder")
-indexing_pipeline.add_component(instance=DocumentWriter(document_store=doc_store), name="DocWriter")
+indexing_pipeline.add_component(instance=doc_writer, name="DocWriter")
 indexing_pipeline.connect("DocEmbedder", "DocWriter")
 
 indexing_pipeline.run({"DocEmbedder": {"documents": docs}})
 
@@ -107,15 +109,15 @@ print(f"Embedding of first Document: {doc_store.filter_documents()[0].embedding}
 ```
 
 Query the Semantic Search Pipeline using the `InMemoryEmbeddingRetriever` and `VoyageTextEmbedder`:
+
 ```python
-text_embedder = VoyageTextEmbedder(model_name="voyage-01", input_type="query", api_key="VOYAGE_API_KEY")
+text_embedder = VoyageTextEmbedder(model="voyage-2", input_type="query")
 
 # Query Pipeline
 query_pipeline = Pipeline()
-query_pipeline.add_component("TextEmbedder", text_embedder)
-query_pipeline.add_component("Retriever", InMemoryEmbeddingRetriever(document_store=doc_store))
-query_pipeline.connect("TextEmbedder", "Retriever")
-
+query_pipeline.add_component(instance=text_embedder, name="TextEmbedder")
+query_pipeline.add_component(instance=retriever, name="Retriever")
+query_pipeline.connect("TextEmbedder.embedding", "Retriever.query_embedding")
 
 # Search
 results = query_pipeline.run({"TextEmbedder": {"text": "Which year did the Joker movie release?"}})
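
After the query pipeline above has run, the retrieved documents can be read back from the `results` dictionary returned by `Pipeline.run()`. Below is a minimal sketch of inspecting that output; it assumes the `"Retriever"` component name used in the example and the `documents` output key that Haystack 2.x retrievers expose.

```python
# Minimal sketch: read back the documents retrieved by the query pipeline above.
# Assumes `results` is the dictionary returned by query_pipeline.run(...) and that
# the retriever was registered under the name "Retriever".
retrieved_docs = results["Retriever"]["documents"]

for doc in retrieved_docs[:3]:
    # Each retrieved Document carries its text content and a similarity score.
    print(f"score={doc.score:.4f}  content={doc.content[:120]}")
```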