From c1ba30741652698f1aa455c5fa97597cd50fa262 Mon Sep 17 00:00:00 2001 From: itaybar <46649219+itaybar@users.noreply.github.com> Date: Sat, 30 Dec 2023 21:00:44 +0200 Subject: [PATCH 1/4] removed transformers dependency --- .env | 2 +- package.json | 1 - src/lib/server/websearch/runWebSearch.ts | 12 +-- .../server/websearch/sentenceSimilarity.ts | 100 +++++++++--------- 4 files changed, 56 insertions(+), 59 deletions(-) diff --git a/.env b/.env index 9e091c37a8f..4c7cddc45f6 100644 --- a/.env +++ b/.env @@ -105,7 +105,7 @@ PUBLIC_APP_COLOR=blue # can be any of tailwind colors: https://tailwindcss.com/d PUBLIC_APP_DESCRIPTION=# description used throughout the app (if not set, a default one will be used) PUBLIC_APP_DATA_SHARING=#set to 1 to enable options & text regarding data sharing PUBLIC_APP_DISCLAIMER=#set to 1 to show a disclaimer on login page -LLM_SUMMERIZATION=true +LLM_SUMMERIZATION=false # PUBLIC_APP_NAME=HuggingChat # PUBLIC_APP_ASSETS=huggingchat diff --git a/package.json b/package.json index 4fb71d007be..998987d9978 100644 --- a/package.json +++ b/package.json @@ -47,7 +47,6 @@ "@huggingface/hub": "^0.5.1", "@huggingface/inference": "^2.6.3", "@iconify-json/bi": "^1.1.21", - "@xenova/transformers": "^2.6.0", "autoprefixer": "^10.4.14", "browser-image-resizer": "^2.4.1", "date-fns": "^2.29.3", diff --git a/src/lib/server/websearch/runWebSearch.ts b/src/lib/server/websearch/runWebSearch.ts index 0869ea8b494..0423877f040 100644 --- a/src/lib/server/websearch/runWebSearch.ts +++ b/src/lib/server/websearch/runWebSearch.ts @@ -4,10 +4,7 @@ import type { WebSearch, WebSearchSource } from "$lib/types/WebSearch"; import { generateQuery } from "$lib/server/websearch/generateQuery"; import { parseWeb } from "$lib/server/websearch/parseWeb"; import { chunk } from "$lib/utils/chunk"; -import { - MAX_SEQ_LEN as CHUNK_CAR_LEN, - findSimilarSentences, -} from "$lib/server/websearch/sentenceSimilarity"; +import { MAX_SEQ_LEN as CHUNK_CAR_LEN } from "$lib/server/websearch/sentenceSimilarity"; import type { Conversation } from "$lib/types/Conversation"; import type { MessageUpdate } from "$lib/types/MessageUpdate"; import { getWebSearchProvider } from "./searchWeb"; @@ -87,9 +84,10 @@ export async function runWebSearch( appendUpdate("Extracting relevant information"); const topKClosestParagraphs = 8; const texts = paragraphChunks.map(({ text }) => text); - const indices = await findSimilarSentences(prompt, texts, { - topK: topKClosestParagraphs, - }); + const indices = []; + // const indices = await findSimilarSentences(prompt, texts, { + // topK: topKClosestParagraphs, + // }); webSearch.context = indices.map((idx) => texts[idx]).join(""); const usedSources = new Set(); diff --git a/src/lib/server/websearch/sentenceSimilarity.ts b/src/lib/server/websearch/sentenceSimilarity.ts index a877f8e0cd6..3c0088ae459 100644 --- a/src/lib/server/websearch/sentenceSimilarity.ts +++ b/src/lib/server/websearch/sentenceSimilarity.ts @@ -1,52 +1,52 @@ -import type { Tensor, Pipeline } from "@xenova/transformers"; -import { pipeline, dot } from "@xenova/transformers"; - -// see here: https://github.com/nmslib/hnswlib/blob/359b2ba87358224963986f709e593d799064ace6/README.md?plain=1#L34 -function innerProduct(tensor1: Tensor, tensor2: Tensor) { - return 1.0 - dot(tensor1.data, tensor2.data); -} - -// Use the Singleton pattern to enable lazy construction of the pipeline. -class PipelineSingleton { - static modelId = "Xenova/gte-small"; - static instance: Promise | null = null; - static async getInstance() { - if (this.instance === null) { - this.instance = pipeline("feature-extraction", this.modelId); - } - return this.instance; - } -} - -// see https://huggingface.co/thenlper/gte-small/blob/d8e2604cadbeeda029847d19759d219e0ce2e6d8/README.md?code=true#L2625 +// import type { Tensor, Pipeline } from "@xenova/transformers"; +// import { pipeline, dot } from "@xenova/transformers"; + +// // see here: https://github.com/nmslib/hnswlib/blob/359b2ba87358224963986f709e593d799064ace6/README.md?plain=1#L34 +// function innerProduct(tensor1: Tensor, tensor2: Tensor) { +// return 1.0 - dot(tensor1.data, tensor2.data); +// } + +// // Use the Singleton pattern to enable lazy construction of the pipeline. +// class PipelineSingleton { +// static modelId = "Xenova/gte-small"; +// static instance: Promise | null = null; +// static async getInstance() { +// if (this.instance === null) { +// this.instance = pipeline("feature-extraction", this.modelId); +// } +// return this.instance; +// } +// } + +// // see https://huggingface.co/thenlper/gte-small/blob/d8e2604cadbeeda029847d19759d219e0ce2e6d8/README.md?code=true#L2625 export const MAX_SEQ_LEN = 512 as const; -export async function findSimilarSentences( - query: string, - sentences: string[], - { topK = 5 }: { topK: number } -) { - const input = [query, ...sentences]; - - const extractor = await PipelineSingleton.getInstance(); - const output: Tensor = await extractor(input, { pooling: "mean", normalize: true }); - - const queryTensor: Tensor = output[0]; - const sentencesTensor: Tensor = output.slice([1, input.length - 1]); - - const distancesFromQuery: { distance: number; index: number }[] = [...sentencesTensor].map( - (sentenceTensor: Tensor, index: number) => { - return { - distance: innerProduct(queryTensor, sentenceTensor), - index: index, - }; - } - ); - - distancesFromQuery.sort((a, b) => { - return a.distance - b.distance; - }); - - // Return the indexes of the closest topK sentences - return distancesFromQuery.slice(0, topK).map((item) => item.index); -} +// export async function findSimilarSentences( +// query: string, +// sentences: string[], +// { topK = 5 }: { topK: number } +// ) { +// const input = [query, ...sentences]; + +// const extractor = await PipelineSingleton.getInstance(); +// const output: Tensor = await extractor(input, { pooling: "mean", normalize: true }); + +// const queryTensor: Tensor = output[0]; +// const sentencesTensor: Tensor = output.slice([1, input.length - 1]); + +// const distancesFromQuery: { distance: number; index: number }[] = [...sentencesTensor].map( +// (sentenceTensor: Tensor, index: number) => { +// return { +// distance: innerProduct(queryTensor, sentenceTensor), +// index: index, +// }; +// } +// ); + +// distancesFromQuery.sort((a, b) => { +// return a.distance - b.distance; +// }); + +// // Return the indexes of the closest topK sentences +// return distancesFromQuery.slice(0, topK).map((item) => item.index); +// } From 2d02aabde8adfebb0f033f574b00618595dd7eab Mon Sep 17 00:00:00 2001 From: itaybar <46649219+itaybar@users.noreply.github.com> Date: Sat, 30 Dec 2023 21:20:23 +0200 Subject: [PATCH 2/4] format --- src/lib/server/websearch/runWebSearch.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib/server/websearch/runWebSearch.ts b/src/lib/server/websearch/runWebSearch.ts index 0423877f040..0966bb351ae 100644 --- a/src/lib/server/websearch/runWebSearch.ts +++ b/src/lib/server/websearch/runWebSearch.ts @@ -82,7 +82,7 @@ export async function runWebSearch( } appendUpdate("Extracting relevant information"); - const topKClosestParagraphs = 8; + // const topKClosestParagraphs = 8; const texts = paragraphChunks.map(({ text }) => text); const indices = []; // const indices = await findSimilarSentences(prompt, texts, { From 658fd893e32aeb946bc00d8e69971e0e97dbdbd2 Mon Sep 17 00:00:00 2001 From: itaybar <46649219+itaybar@users.noreply.github.com> Date: Sat, 30 Dec 2023 21:28:39 +0200 Subject: [PATCH 3/4] format2 --- src/lib/server/websearch/runWebSearch.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib/server/websearch/runWebSearch.ts b/src/lib/server/websearch/runWebSearch.ts index 0966bb351ae..ec1f14e6d65 100644 --- a/src/lib/server/websearch/runWebSearch.ts +++ b/src/lib/server/websearch/runWebSearch.ts @@ -84,7 +84,7 @@ export async function runWebSearch( appendUpdate("Extracting relevant information"); // const topKClosestParagraphs = 8; const texts = paragraphChunks.map(({ text }) => text); - const indices = []; + const indices: any[] = []; // const indices = await findSimilarSentences(prompt, texts, { // topK: topKClosestParagraphs, // }); From 1cac79e278d89fe175e58cf829630fce327c5453 Mon Sep 17 00:00:00 2001 From: itaybar <46649219+itaybar@users.noreply.github.com> Date: Sat, 30 Dec 2023 21:30:32 +0200 Subject: [PATCH 4/4] format --- src/lib/server/websearch/runWebSearch.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/src/lib/server/websearch/runWebSearch.ts b/src/lib/server/websearch/runWebSearch.ts index ec1f14e6d65..e91b87fd7e9 100644 --- a/src/lib/server/websearch/runWebSearch.ts +++ b/src/lib/server/websearch/runWebSearch.ts @@ -84,6 +84,7 @@ export async function runWebSearch( appendUpdate("Extracting relevant information"); // const topKClosestParagraphs = 8; const texts = paragraphChunks.map(({ text }) => text); + // eslint-disable-next-line @typescript-eslint/no-explicit-any const indices: any[] = []; // const indices = await findSimilarSentences(prompt, texts, { // topK: topKClosestParagraphs,