From ab3d1a3e2d439ef0c558bd883736fca7d33059ce Mon Sep 17 00:00:00 2001
From: anakin87
Date: Tue, 16 Apr 2024 17:49:26 +0100
Subject: [PATCH] Qdrant: add embedding retrieval example

---
 .../qdrant/examples/embedding_retrieval.py | 52 +++++++++++++++++++
 integrations/qdrant/pyproject.toml         |  2 +
 2 files changed, 54 insertions(+)
 create mode 100644 integrations/qdrant/examples/embedding_retrieval.py

diff --git a/integrations/qdrant/examples/embedding_retrieval.py b/integrations/qdrant/examples/embedding_retrieval.py
new file mode 100644
index 000000000..f009191e7
--- /dev/null
+++ b/integrations/qdrant/examples/embedding_retrieval.py
@@ -0,0 +1,52 @@
+# Install required packages for this example, including qdrant-haystack and other libraries needed
+# for Markdown conversion and embeddings generation. Use the following command:
+# pip install qdrant-haystack markdown-it-py mdit_plain sentence-transformers
+
+# Download some Markdown files to index.
+# git clone https://github.com/anakin87/neural-search-pills
+
+import glob
+
+from haystack import Pipeline
+from haystack.components.converters import MarkdownToDocument
+from haystack.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder
+from haystack.components.preprocessors import DocumentSplitter
+from haystack.components.writers import DocumentWriter
+from haystack_integrations.components.retrievers.qdrant import QdrantEmbeddingRetriever
+from haystack_integrations.document_stores.qdrant import QdrantDocumentStore
+
+# Initialize QdrantDocumentStore: for simplicity, we use an in-memory store here.
+# You can also run a Qdrant instance using Docker or use Qdrant Cloud.
+document_store = QdrantDocumentStore(
+    ":memory:",
+    index="Document",
+    embedding_dim=768,
+    recreate_index=True,
+)
+
+# Create the indexing Pipeline and index some documents
+file_paths = glob.glob("neural-search-pills/pills/*.md")
+
+
+indexing = Pipeline()
+indexing.add_component("converter", MarkdownToDocument())
+indexing.add_component("splitter", DocumentSplitter(split_by="sentence", split_length=2))
+indexing.add_component("embedder", SentenceTransformersDocumentEmbedder())
+indexing.add_component("writer", DocumentWriter(document_store))
+indexing.connect("converter", "splitter")
+indexing.connect("splitter", "embedder")
+indexing.connect("embedder", "writer")
+
+indexing.run({"converter": {"sources": file_paths}})
+
+# Create the querying Pipeline and try a query
+querying = Pipeline()
+querying.add_component("embedder", SentenceTransformersTextEmbedder())
+querying.add_component("retriever", QdrantEmbeddingRetriever(document_store=document_store, top_k=3))
+querying.connect("embedder", "retriever")
+
+results = querying.run({"embedder": {"text": "What is a cross-encoder?"}})
+
+for doc in results["retriever"]["documents"]:
+    print(doc)
+    print("-" * 10)
diff --git a/integrations/qdrant/pyproject.toml b/integrations/qdrant/pyproject.toml
index a566de955..fc1ea7b7e 100644
--- a/integrations/qdrant/pyproject.toml
+++ b/integrations/qdrant/pyproject.toml
@@ -127,6 +127,8 @@ ban-relative-imports = "parents"
 
 [tool.ruff.per-file-ignores]
 # Tests can use magic values, assertions, and relative imports
 "tests/**/*" = ["PLR2004", "S101", "TID252"]
+# examples can contain "print" commands
+"examples/**/*" = ["T201"]
 
 [tool.coverage.run]
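
Note: the example above uses an in-memory Qdrant store for simplicity. If you instead run Qdrant via Docker, as the comment in embedding_retrieval.py suggests, only the document store construction changes. The sketch below is a hedged illustration, not part of the patch: it assumes the standard qdrant/qdrant Docker image on its default port 6333 and that QdrantDocumentStore accepts a `url` argument pointing at the running server; adjust to your own setup.

# Start a local Qdrant server first (assumed command, official image):
#   docker run -p 6333:6333 qdrant/qdrant

from haystack_integrations.document_stores.qdrant import QdrantDocumentStore

# Assumption: `url` points QdrantDocumentStore at the Dockerized server instead
# of the in-memory store; the indexing and querying pipelines stay unchanged.
document_store = QdrantDocumentStore(
    url="http://localhost:6333",
    index="Document",
    embedding_dim=768,
    recreate_index=True,
)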