From 2451f629741aeb42b0732ad767e253acd69162be Mon Sep 17 00:00:00 2001
From: jlonge4
Date: Thu, 16 May 2024 19:13:17 -0400
Subject: [PATCH 1/5] hybrid retrieval ex

---
 .../pgvector/examples/hybrid_retrieval.py | 69 +++++++++++++++++++
 1 file changed, 69 insertions(+)
 create mode 100644 integrations/pgvector/examples/hybrid_retrieval.py

diff --git a/integrations/pgvector/examples/hybrid_retrieval.py b/integrations/pgvector/examples/hybrid_retrieval.py
new file mode 100644
index 000000000..492d1b2e4
--- /dev/null
+++ b/integrations/pgvector/examples/hybrid_retrieval.py
@@ -0,0 +1,69 @@
+# Before running this example, ensure you have PostgreSQL installed with the pgvector extension.
+# For a quick setup using Docker:
+# docker run -d -p 5432:5432 -e POSTGRES_USER=postgres -e POSTGRES_PASSWORD=postgres
+# -e POSTGRES_DB=postgres ankane/pgvector
+
+# Install required packages for this example, including pgvector-haystack and other libraries needed
+# for Markdown conversion and embeddings generation. Use the following command:
+# pip install pgvector-haystack markdown-it-py mdit_plain "sentence-transformers>=2.2.0"
+
+# Download some Markdown files to index.
+# git clone https://github.com/anakin87/neural-search-pills
+
+import glob
+
+from haystack import Pipeline
+from haystack.components.converters import MarkdownToDocument
+from haystack.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder
+from haystack.components.joiners import DocumentJoiner
+from haystack.components.preprocessors import DocumentSplitter
+from haystack.components.writers import DocumentWriter
+from haystack_integrations.components.retrievers.pgvector import PgvectorEmbeddingRetriever, PgvectorKeywordRetriever
+from haystack_integrations.document_stores.pgvector import PgvectorDocumentStore
+
+# Set an environment variable `PG_CONN_STR` with the connection string to your PostgreSQL database.
+# e.g., "postgresql://USER:PASSWORD@HOST:PORT/DB_NAME"
+
+# Initialize PgvectorDocumentStore
+document_store = PgvectorDocumentStore(
+    table_name="haystack_test",
+    embedding_dimension=768,
+    vector_function="cosine_similarity",
+    recreate_table=True,
+    search_strategy="hnsw",
+)
+
+# Create the indexing Pipeline and index some documents
+file_paths = glob.glob("neural-search-pills/pills/*.md")
+
+
+indexing = Pipeline()
+indexing.add_component("converter", MarkdownToDocument())
+indexing.add_component("splitter", DocumentSplitter(split_by="sentence", split_length=2))
+indexing.add_component("embedder", SentenceTransformersDocumentEmbedder())
+indexing.add_component("writer", DocumentWriter(document_store))
+indexing.connect("converter", "splitter")
+indexing.connect("splitter", "embedder")
+indexing.connect("embedder", "writer")
+
+indexing.run({"converter": {"sources": file_paths}})
+
+# Create the querying Pipeline and try a query
+querying = Pipeline()
+querying.add_component("embedder", SentenceTransformersTextEmbedder())
+querying.add_component("retriever", PgvectorEmbeddingRetriever(document_store=document_store, top_k=3))
+querying.add_component("keyword_retriever", PgvectorKeywordRetriever(document_store=document_store, top_k=3))
+querying.add_component(
+    "joiner",
+    DocumentJoiner(join_mode="reciprocal_rank_fusion", top_k=3),
+)
+querying.connect("embedder", "retriever")
+querying.connect("keyword_retriever", "joiner")
+querying.connect("retriever", "joiner")
+
+query = "What is a cross-encoder?"
+results = querying.run({"text_embedder": {"text": query}, "bm25_retriever": {"query": query}})
+
+for doc in results["joiner"]["documents"]:
+    print(doc)
+    print("-" * 10)

From 3001b25d24c1b08d5bedea853098c89871d0d130 Mon Sep 17 00:00:00 2001
From: jlonge4 <91354480+jlonge4@users.noreply.github.com>
Date: Fri, 17 May 2024 16:53:52 -0400
Subject: [PATCH 2/5] Update integrations/pgvector/examples/hybrid_retrieval.py

Co-authored-by: Stefano Fiorucci
---
 integrations/pgvector/examples/hybrid_retrieval.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/integrations/pgvector/examples/hybrid_retrieval.py b/integrations/pgvector/examples/hybrid_retrieval.py
index 492d1b2e4..efeecfb94 100644
--- a/integrations/pgvector/examples/hybrid_retrieval.py
+++ b/integrations/pgvector/examples/hybrid_retrieval.py
@@ -62,7 +62,7 @@
 querying.connect("retriever", "joiner")
 
 query = "What is a cross-encoder?"
-results = querying.run({"text_embedder": {"text": query}, "bm25_retriever": {"query": query}})
+results = querying.run({"text_embedder": {"text": query}, "keyword_retriever": {"query": query}})
 
 for doc in results["joiner"]["documents"]:
     print(doc)

From 53b5ac8260afaf336c6cdf2ad41e3ad1536e9a4d Mon Sep 17 00:00:00 2001
From: jlonge4
Date: Fri, 17 May 2024 17:02:39 -0400
Subject: [PATCH 3/5] suggested updates

---
 integrations/pgvector/examples/hybrid_retrieval.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/integrations/pgvector/examples/hybrid_retrieval.py b/integrations/pgvector/examples/hybrid_retrieval.py
index efeecfb94..820eddf8b 100644
--- a/integrations/pgvector/examples/hybrid_retrieval.py
+++ b/integrations/pgvector/examples/hybrid_retrieval.py
@@ -57,11 +57,11 @@
     "joiner",
     DocumentJoiner(join_mode="reciprocal_rank_fusion", top_k=3),
 )
-querying.connect("embedder", "retriever")
+querying.connect("text_embedder", "retriever")
 querying.connect("keyword_retriever", "joiner")
 querying.connect("retriever", "joiner")
 
-query = "What is a cross-encoder?"
+query = "cross-encoder"
 results = querying.run({"text_embedder": {"text": query}, "keyword_retriever": {"query": query}})
 
 for doc in results["joiner"]["documents"]:

From 14eac600f327a29f7c9901e80511496b514c4f2e Mon Sep 17 00:00:00 2001
From: jlonge4
Date: Fri, 17 May 2024 17:05:04 -0400
Subject: [PATCH 4/5] suggested updates

---
 integrations/pgvector/examples/hybrid_retrieval.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/integrations/pgvector/examples/hybrid_retrieval.py b/integrations/pgvector/examples/hybrid_retrieval.py
index 820eddf8b..fa3a13177 100644
--- a/integrations/pgvector/examples/hybrid_retrieval.py
+++ b/integrations/pgvector/examples/hybrid_retrieval.py
@@ -40,17 +40,17 @@
 indexing = Pipeline()
 indexing.add_component("converter", MarkdownToDocument())
 indexing.add_component("splitter", DocumentSplitter(split_by="sentence", split_length=2))
-indexing.add_component("embedder", SentenceTransformersDocumentEmbedder())
+indexing.add_component("document_embedder", SentenceTransformersDocumentEmbedder())
 indexing.add_component("writer", DocumentWriter(document_store))
 indexing.connect("converter", "splitter")
 indexing.connect("splitter", "embedder")
-indexing.connect("embedder", "writer")
+indexing.connect("document_embedder", "writer")
 
 indexing.run({"converter": {"sources": file_paths}})
 
 # Create the querying Pipeline and try a query
 querying = Pipeline()
-querying.add_component("embedder", SentenceTransformersTextEmbedder())
+querying.add_component("text_embedder", SentenceTransformersTextEmbedder())
 querying.add_component("retriever", PgvectorEmbeddingRetriever(document_store=document_store, top_k=3))
 querying.add_component("keyword_retriever", PgvectorKeywordRetriever(document_store=document_store, top_k=3))
 querying.add_component(

From 16ca897802430c5d885c4bd295e30ae9d76611a4 Mon Sep 17 00:00:00 2001
From: jlonge4
Date: Fri, 17 May 2024 17:05:23 -0400
Subject: [PATCH 5/5] suggested updates

---
 integrations/pgvector/examples/hybrid_retrieval.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/integrations/pgvector/examples/hybrid_retrieval.py b/integrations/pgvector/examples/hybrid_retrieval.py
index fa3a13177..cee98fe08 100644
--- a/integrations/pgvector/examples/hybrid_retrieval.py
+++ b/integrations/pgvector/examples/hybrid_retrieval.py
@@ -43,7 +43,7 @@
 indexing.add_component("document_embedder", SentenceTransformersDocumentEmbedder())
 indexing.add_component("writer", DocumentWriter(document_store))
 indexing.connect("converter", "splitter")
-indexing.connect("splitter", "embedder")
+indexing.connect("splitter", "document_embedder")
 indexing.connect("document_embedder", "writer")
 
 indexing.run({"converter": {"sources": file_paths}})