From dd376f7127a41a1947ec578b2417d66215769bb5 Mon Sep 17 00:00:00 2001 From: anadi45 Date: Fri, 27 Dec 2024 20:08:57 +0530 Subject: [PATCH 1/5] feat: add mmr search to pgvector --- .../src/vectorstores/pgvector.ts | 67 +++++++++++++++++-- 1 file changed, 62 insertions(+), 5 deletions(-) diff --git a/libs/langchain-community/src/vectorstores/pgvector.ts b/libs/langchain-community/src/vectorstores/pgvector.ts index 4b5e39b95827..0557274998bb 100644 --- a/libs/langchain-community/src/vectorstores/pgvector.ts +++ b/libs/langchain-community/src/vectorstores/pgvector.ts @@ -1,8 +1,12 @@ import pg, { type Pool, type PoolClient, type PoolConfig } from "pg"; -import { VectorStore } from "@langchain/core/vectorstores"; +import { + MaxMarginalRelevanceSearchOptions, + VectorStore, +} from "@langchain/core/vectorstores"; import type { EmbeddingsInterface } from "@langchain/core/embeddings"; import { Document } from "@langchain/core/documents"; import { getEnvironmentVariable } from "@langchain/core/utils/env"; +import { maximalMarginalRelevance } from "@langchain/core/utils/math"; type Metadata = Record; @@ -261,9 +265,15 @@ export class PGVectorStore extends VectorStore { this.chunkSize = config.chunkSize ?? 500; this.distanceStrategy = config.distanceStrategy ?? this.distanceStrategy; - this._verbose = - getEnvironmentVariable("LANGCHAIN_VERBOSE") === "true" ?? - !!config.verbose; + const langchainVerbose = getEnvironmentVariable("LANGCHAIN_VERBOSE"); + + if (langchainVerbose === "true") { + this._verbose = true; + } else if (langchainVerbose === "false") { + this._verbose = false; + } else { + this._verbose = config.verbose; + } } get computedTableName() { @@ -603,12 +613,14 @@ export class PGVectorStore extends VectorStore { * @param query - Query vector. * @param k - Number of most similar documents to return. * @param filter - Optional filter to apply to the search. + * @param includeEmbedding Whether to include the embedding vectors in the results. * @returns Promise that resolves with an array of tuples, each containing a `Document` and its similarity score. */ async similaritySearchVectorWithScore( query: number[], k: number, - filter?: this["FilterType"] + filter?: this["FilterType"], + includeEmbedding?: boolean ): Promise<[Document, number][]> { const embeddingString = `[${query.join(",")}]`; const _filter: this["FilterType"] = filter ?? {}; @@ -688,6 +700,9 @@ export class PGVectorStore extends VectorStore { metadata: doc[this.metadataColumnName], id: doc[this.idColumnName], }); + if (includeEmbedding) { + document.metadata[this.vectorColumnName] = doc[this.vectorColumnName]; + } results.push([document, doc._distance]); } } @@ -879,4 +894,46 @@ export class PGVectorStore extends VectorStore { ); } } + + /** + * Return documents selected using the maximal marginal relevance. + * Maximal marginal relevance optimizes for similarity to the query AND + * diversity among selected documents. + * @param query Text to look up documents similar to. + * @param options.k=4 Number of documents to return. + * @param options.fetchK=20 Number of documents to fetch before passing to + * the MMR algorithm. + * @param options.lambda=0.5 Number between 0 and 1 that determines the + * degree of diversity among the results, where 0 corresponds to maximum + * diversity and 1 to minimum diversity. + * @returns List of documents selected by maximal marginal relevance. + */ + async maxMarginalRelevanceSearch( + query: string, + options: MaxMarginalRelevanceSearchOptions + ): Promise { + const { k = 4, fetchK = 20, lambda = 0.5, filter } = options; + const queryEmbedding = await this.embeddings.embedQuery(query); + + const docs = await this.similaritySearchVectorWithScore( + queryEmbedding, + fetchK, + filter, + true + ); + + const embeddingList = docs.map( + (doc) => doc[0].metadata[this.vectorColumnName] + ); + + const mmrIndexes = maximalMarginalRelevance( + queryEmbedding, + embeddingList, + lambda, + k + ); + + const mmrDocs = mmrIndexes.map((index) => docs[index][0]); + return mmrDocs; + } } From 5edb746eb321a085062f67365f443c65202939b4 Mon Sep 17 00:00:00 2001 From: anadi45 Date: Fri, 27 Dec 2024 20:13:36 +0530 Subject: [PATCH 2/5] fix: parse array from string --- libs/langchain-community/src/vectorstores/pgvector.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libs/langchain-community/src/vectorstores/pgvector.ts b/libs/langchain-community/src/vectorstores/pgvector.ts index 0557274998bb..de74f4f09163 100644 --- a/libs/langchain-community/src/vectorstores/pgvector.ts +++ b/libs/langchain-community/src/vectorstores/pgvector.ts @@ -922,8 +922,8 @@ export class PGVectorStore extends VectorStore { true ); - const embeddingList = docs.map( - (doc) => doc[0].metadata[this.vectorColumnName] + const embeddingList = docs.map((doc) => + JSON.parse(doc[0].metadata[this.vectorColumnName]) ); const mmrIndexes = maximalMarginalRelevance( From 0827785a3c6b8f36ea5ca9cfe350017d50a023e6 Mon Sep 17 00:00:00 2001 From: anadi45 Date: Tue, 31 Dec 2024 23:11:50 +0530 Subject: [PATCH 3/5] refactor: private method for similarity search --- .../src/vectorstores/pgvector.ts | 26 ++++++++++++++----- 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/libs/langchain-community/src/vectorstores/pgvector.ts b/libs/langchain-community/src/vectorstores/pgvector.ts index de74f4f09163..b41b714038b6 100644 --- a/libs/langchain-community/src/vectorstores/pgvector.ts +++ b/libs/langchain-community/src/vectorstores/pgvector.ts @@ -606,17 +606,14 @@ export class PGVectorStore extends VectorStore { } /** - * Method to perform a similarity search in the vector store. It returns - * the `k` most similar documents to the query vector, along with their - * similarity scores. - * + * Method to perform a similarity search in the vector store. It returns the `k` most similar documents to the query text. * @param query - Query vector. * @param k - Number of most similar documents to return. * @param filter - Optional filter to apply to the search. * @param includeEmbedding Whether to include the embedding vectors in the results. * @returns Promise that resolves with an array of tuples, each containing a `Document` and its similarity score. */ - async similaritySearchVectorWithScore( + private async searchPostgres( query: number[], k: number, filter?: this["FilterType"], @@ -709,6 +706,23 @@ export class PGVectorStore extends VectorStore { return results; } + /** + * Method to perform a similarity search in the vector store. It returns + * the `k` most similar documents to the query vector, along with their + * similarity scores. + * @param query - Query vector. + * @param k - Number of most similar documents to return. + * @param filter - Optional filter to apply to the search. + * @returns Promise that resolves with an array of tuples, each containing a `Document` and its similarity score. + */ + async similaritySearchVectorWithScore( + query: number[], + k: number, + filter?: this["FilterType"] + ): Promise<[Document, number][]> { + return this.searchPostgres(query, k, filter, false); + } + /** * Method to ensure the existence of the table in the database. It creates * the table if it does not already exist. @@ -915,7 +929,7 @@ export class PGVectorStore extends VectorStore { const { k = 4, fetchK = 20, lambda = 0.5, filter } = options; const queryEmbedding = await this.embeddings.embedQuery(query); - const docs = await this.similaritySearchVectorWithScore( + const docs = await this.searchPostgres( queryEmbedding, fetchK, filter, From f589d44ff8388fa89928f5e691de3e062c604325 Mon Sep 17 00:00:00 2001 From: jacoblee93 Date: Thu, 2 Jan 2025 11:34:29 -0800 Subject: [PATCH 4/5] Add test --- .../tests/pgvector/docker-compose.yml | 5 ++-- .../tests/pgvector/pgvector.int.test.ts | 26 +++++++++++++++++++ 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/libs/langchain-community/src/vectorstores/tests/pgvector/docker-compose.yml b/libs/langchain-community/src/vectorstores/tests/pgvector/docker-compose.yml index 306214d0984e..3be9e8b80e97 100644 --- a/libs/langchain-community/src/vectorstores/tests/pgvector/docker-compose.yml +++ b/libs/langchain-community/src/vectorstores/tests/pgvector/docker-compose.yml @@ -1,10 +1,9 @@ # Run this command to start the database: -# docker-compose up --build -version: "3" +# docker compose up --build services: db: hostname: 127.0.0.1 - image: ankane/pgvector + image: pgvector/pgvector:pg16 ports: - 5432:5432 restart: always diff --git a/libs/langchain-community/src/vectorstores/tests/pgvector/pgvector.int.test.ts b/libs/langchain-community/src/vectorstores/tests/pgvector/pgvector.int.test.ts index 5f01d3012b25..828c6f57a2f2 100644 --- a/libs/langchain-community/src/vectorstores/tests/pgvector/pgvector.int.test.ts +++ b/libs/langchain-community/src/vectorstores/tests/pgvector/pgvector.int.test.ts @@ -73,6 +73,32 @@ describe("PGVectorStore", () => { expect(results).toHaveLength(1); expect(results[0].pageContent).toEqual("Cat drinks milk"); }); + + test.only("Test MMR search", async () => { + const documents = [ + { + pageContent: "hello", + metadata: { a: 1 }, + }, + { + pageContent: "Cat drinks milk", + metadata: { a: 2 }, + }, + { + pageContent: "foo", + metadata: { a: 2 }, + }, + { pageContent: "hi", metadata: { a: 1 } }, + ]; + await pgvectorVectorStore.addDocuments(documents); + const results = await pgvectorVectorStore.maxMarginalRelevanceSearch("milk", { + k: 2, + }); + + expect(results).toHaveLength(2); + expect(results[0].pageContent).toEqual("Cat drinks milk"); + expect(results[1].pageContent).toEqual("foo"); + }); test("PGvector can save documents with a list greater than default chunk size", async () => { // Extract the default chunk size and add one. From 55b0b3132419573947b72c78bc7b16b8f09a9e17 Mon Sep 17 00:00:00 2001 From: jacoblee93 Date: Thu, 2 Jan 2025 12:30:39 -0800 Subject: [PATCH 5/5] Fix --- .../vectorstores/tests/pgvector/pgvector.int.test.ts | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/libs/langchain-community/src/vectorstores/tests/pgvector/pgvector.int.test.ts b/libs/langchain-community/src/vectorstores/tests/pgvector/pgvector.int.test.ts index 828c6f57a2f2..db28c80d34bd 100644 --- a/libs/langchain-community/src/vectorstores/tests/pgvector/pgvector.int.test.ts +++ b/libs/langchain-community/src/vectorstores/tests/pgvector/pgvector.int.test.ts @@ -73,7 +73,7 @@ describe("PGVectorStore", () => { expect(results).toHaveLength(1); expect(results[0].pageContent).toEqual("Cat drinks milk"); }); - + test.only("Test MMR search", async () => { const documents = [ { @@ -91,9 +91,12 @@ describe("PGVectorStore", () => { { pageContent: "hi", metadata: { a: 1 } }, ]; await pgvectorVectorStore.addDocuments(documents); - const results = await pgvectorVectorStore.maxMarginalRelevanceSearch("milk", { - k: 2, - }); + const results = await pgvectorVectorStore.maxMarginalRelevanceSearch( + "milk", + { + k: 2, + } + ); expect(results).toHaveLength(2); expect(results[0].pageContent).toEqual("Cat drinks milk");