From 0c79483ad28b979ec738ded60db1d5ae0e2153e4 Mon Sep 17 00:00:00 2001 From: axuj <68978293+axuj@users.noreply.github.com> Date: Tue, 10 Dec 2024 08:08:52 +0800 Subject: [PATCH] feat(community): update embedding jina (#7292) Co-authored-by: Jacob Lee --- .../docs/integrations/text_embedding/jina.mdx | 36 ++- .../src/embeddings/jina.ts | 276 ++++++++++-------- .../src/embeddings/tests/jina.int.test.ts | 6 +- .../src/utils/local_image_to_base64.ts | 7 + 4 files changed, 197 insertions(+), 128 deletions(-) create mode 100644 libs/langchain-community/src/utils/local_image_to_base64.ts diff --git a/docs/core_docs/docs/integrations/text_embedding/jina.mdx b/docs/core_docs/docs/integrations/text_embedding/jina.mdx index e44100a91856..c03396d1288d 100644 --- a/docs/core_docs/docs/integrations/text_embedding/jina.mdx +++ b/docs/core_docs/docs/integrations/text_embedding/jina.mdx @@ -34,12 +34,12 @@ Here’s how to create an instance of `JinaEmbeddings`: import { JinaEmbeddings } from "@langchain/community/embeddings/jina"; const embeddings = new JinaEmbeddings({ - apiToken: "YOUR_API_TOKEN", - model: "jina-embeddings-v2-base-en", // Optional, defaults to "jina-embeddings-v2-base-en" + apiKey: "YOUR_API_TOKEN", + model: "jina-clip-v2", // Optional, defaults to "jina-clip-v2" }); ``` -If the `apiToken` is not provided, it will be read from the `JINA_API_KEY` environment variable. +If the `apiKey` is not provided, it will be read from the `JINA_API_KEY` environment variable. ## Generating Embeddings @@ -59,10 +59,18 @@ console.log(embedding); To generate embeddings for multiple documents, use the `embedDocuments` method. 
```typescript +import { localImageToBase64 } from "@langchain/community/utils/local_image_to_base64"; const documents = [ - "Document 1 text...", - "Document 2 text...", - "Document 3 text...", + "hello", + { + text: "hello", + }, + { + image: "https://i.ibb.co/nQNGqL0/beach1.jpg", + }, + { + image: await localImageToBase64("beach1.jpg"), + }, ]; const embeddingsArray = await embeddings.embedDocuments(documents); @@ -87,9 +95,10 @@ Here’s a complete example of how to set up and use the `JinaEmbeddings` class: ```typescript import { JinaEmbeddings } from "@langchain/community/embeddings/jina"; +import { localImageToBase64 } from "@langchain/community/utils/local_image_to_base64"; const embeddings = new JinaEmbeddings({ - apiToken: "YOUR_API_TOKEN", + apiKey: "YOUR_API_TOKEN", model: "jina-embeddings-v2-base-en", }); @@ -97,7 +106,18 @@ async function runExample() { const queryEmbedding = await embeddings.embedQuery("Example query text."); console.log("Query Embedding:", queryEmbedding); - const documents = ["Text 1", "Text 2", "Text 3"]; + const documents = [ + "hello", + { + text: "hello", + }, + { + image: "https://i.ibb.co/nQNGqL0/beach1.jpg", + }, + { + image: await localImageToBase64("beach1.jpg"), + }, + ]; const documentEmbeddings = await embeddings.embedDocuments(documents); console.log("Document Embeddings:", documentEmbeddings); } diff --git a/libs/langchain-community/src/embeddings/jina.ts b/libs/langchain-community/src/embeddings/jina.ts index a92d2077d658..870e3a3b24fb 100644 --- a/libs/langchain-community/src/embeddings/jina.ts +++ b/libs/langchain-community/src/embeddings/jina.ts @@ -1,94 +1,113 @@ -import { existsSync, readFileSync } from "fs"; -import { parse } from "url"; -import { Embeddings, EmbeddingsParams } from "@langchain/core/embeddings"; +import { Embeddings, type EmbeddingsParams } from "@langchain/core/embeddings"; +import { chunkArray } from "@langchain/core/utils/chunk_array"; import { getEnvironmentVariable } from 
"@langchain/core/utils/env"; -/** - * The default Jina API URL for embedding requests. - */ -const JINA_API_URL = "https://api.jina.ai/v1/embeddings"; - -/** - * Check if a URL is a local file. - * @param url - The URL to check. - * @returns True if the URL is a local file, False otherwise. - */ -function isLocal(url: string): boolean { - const urlParsed = parse(url); - if (urlParsed.protocol === null || urlParsed.protocol === "file:") { - return existsSync(urlParsed.pathname || ""); - } - return false; -} +export interface JinaEmbeddingsParams extends EmbeddingsParams { + /** Model name to use */ + model: + | "jina-clip-v2" + | "jina-embeddings-v3" + | "jina-colbert-v2" + | "jina-clip-v1" + | "jina-colbert-v1-en" + | "jina-embeddings-v2-base-es" + | "jina-embeddings-v2-base-code" + | "jina-embeddings-v2-base-de" + | "jina-embeddings-v2-base-zh" + | "jina-embeddings-v2-base-en" + | string; + + baseUrl?: string; -/** - * Get the bytes string of a file. - * @param filePath - The path to the file. - * @returns The bytes string of the file. - */ -function getBytesStr(filePath: string): string { - const imageFile = readFileSync(filePath); - return Buffer.from(imageFile).toString("base64"); -} + /** + * Timeout to use when making requests to Jina. + */ + timeout?: number; -/** - * Input parameters for the Jina embeddings - */ -export interface JinaEmbeddingsParams extends EmbeddingsParams { /** - * The API key to use for authentication. - * If not provided, it will be read from the `JINA_API_KEY` environment variable. + * The maximum number of documents to embed in a single request. */ - apiKey?: string; + batchSize?: number; /** - * The model ID to use for generating embeddings. - * Default: `jina-embeddings-v2-base-en` + * Whether to strip new lines from the input text. */ - model?: string; -} + stripNewLines?: boolean; -/** - * Response from the Jina embeddings API. - */ -export interface JinaEmbeddingsResponse { /** - * The embeddings generated for the input texts. 
+ * The dimensions of the embedding. */ - data: { index: number; embedding: number[] }[]; + dimensions?: number; /** - * The detail of the response e.g usage, model used etc. + * Scales the embedding so its Euclidean (L2) norm becomes 1, preserving direction. Useful when downstream tasks involve dot-product similarity, classification, or visualization. */ - detail?: string; + normalized?: boolean; } -/** - * A class for generating embeddings using the Jina API. - * @example - * ```typescript - * // Embed a query using the JinaEmbeddings class - * const model = new JinaEmbeddings(); - * const res = await model.embedQuery( - * "What would be a good name for a semantic search engine ?", - * ); - * console.log({ res }); - * ``` - */ -export class JinaEmbeddings extends Embeddings implements JinaEmbeddingsParams { - apiKey: string; +type JinaMultiModelInput = + | { + text: string; + image?: never; + } + | { + image: string; + text?: never; + }; - model: string; +export type JinaEmbeddingsInput = string | JinaMultiModelInput; + +interface EmbeddingCreateParams { + model: JinaEmbeddingsParams["model"]; /** - * Constructor for the JinaEmbeddings class. - * @param fields - An optional object with properties to configure the instance. 
+ * Inputs may be plain strings or JinaMultiModelInput objects; to embed an image, use a JinaMultiModelInput. */ - constructor(fields?: Partial & { verbose?: boolean }) { - const fieldsWithDefaults = { - model: "jina-embeddings-v2-base-en", - ...fields, - }; + input: JinaEmbeddingsInput[]; + dimensions: number; + task: "retrieval.query" | "retrieval.passage"; + normalized?: boolean; +} + +interface EmbeddingResponse { + model: string; + object: string; + usage: { + total_tokens: number; + prompt_tokens: number; + }; + data: { + object: string; + index: number; + embedding: number[]; + }[]; +} + +interface EmbeddingErrorResponse { + detail: string; +} + +export class JinaEmbeddings extends Embeddings implements JinaEmbeddingsParams { + model: JinaEmbeddingsParams["model"] = "jina-clip-v2"; + + batchSize = 24; + + baseUrl = "https://api.jina.ai/v1/embeddings"; + + stripNewLines = true; + + dimensions = 1024; + + apiKey: string; + + normalized = true; + + constructor( + fields?: Partial & { + apiKey?: string; + } + ) { + const fieldsWithDefaults = { maxConcurrency: 2, ...fields }; super(fieldsWithDefaults); const apiKey = @@ -96,67 +115,90 @@ export class JinaEmbeddings extends Embeddings implements JinaEmbeddingsParams { getEnvironmentVariable("JINA_API_KEY") || getEnvironmentVariable("JINA_AUTH_TOKEN"); - if (!apiKey) { - throw new Error("Jina API key not found"); - } + if (!apiKey) throw new Error("Jina API key not found"); - this.model = fieldsWithDefaults?.model ?? this.model; this.apiKey = apiKey; + + this.model = fieldsWithDefaults?.model ?? this.model; + this.dimensions = fieldsWithDefaults?.dimensions ?? this.dimensions; + this.batchSize = fieldsWithDefaults?.batchSize ?? this.batchSize; + this.stripNewLines = + fieldsWithDefaults?.stripNewLines ?? this.stripNewLines; + this.normalized = fieldsWithDefaults?.normalized ?? this.normalized; } - /** - * Generates embeddings for an array of inputs. 
- * @param input - An array of strings or objects to generate embeddings for. - * @returns A Promise that resolves to an array of embeddings. - */ - // eslint-disable-next-line @typescript-eslint/no-explicit-any - private async _embed(input: any): Promise { - const response = await fetch(JINA_API_URL, { - method: "POST", - headers: { - Authorization: `Bearer ${this.apiKey}`, - "Content-Type": "application/json", - }, - body: JSON.stringify({ input, model: this.model }), + private doStripNewLines(input: JinaEmbeddingsInput[]) { + if (this.stripNewLines) { + return input.map((i) => { + if (typeof i === "string") { + return i.replace(/\n/g, " "); + } + if (i.text) { + return { text: i.text.replace(/\n/g, " ") }; + } + return i; + }); + } + return input; + } + + async embedDocuments(input: JinaEmbeddingsInput[]): Promise { + const batches = chunkArray(this.doStripNewLines(input), this.batchSize); + const batchRequests = batches.map((batch) => { + const params = this.getParams(batch); + return this.embeddingWithRetry(params); }); - const json = (await response.json()) as JinaEmbeddingsResponse; + const batchResponses = await Promise.all(batchRequests); + const embeddings: number[][] = []; - if (!json.data) { - throw new Error(json.detail || "Unknown error from Jina API"); + for (let i = 0; i < batchResponses.length; i += 1) { + const batch = batches[i]; + const batchResponse = batchResponses[i] || []; + for (let j = 0; j < batch.length; j += 1) { + embeddings.push(batchResponse[j]); + } } - const sortedEmbeddings = json.data.sort((a, b) => a.index - b.index); - - return sortedEmbeddings.map((item) => item.embedding); + return embeddings; } - /** - * Generates embeddings for an array of texts. - * @param texts - An array of strings to generate embeddings for. - * @returns A Promise that resolves to an array of embeddings. 
- */ - async embedDocuments(texts: string[]): Promise { - return this._embed(texts); - } + async embedQuery(input: JinaEmbeddingsInput): Promise { + const params = this.getParams(this.doStripNewLines([input]), true); - /** - * Generates an embedding for a single text. - * @param text - A string to generate an embedding for. - * @returns A Promise that resolves to an array of numbers representing the embedding. - */ - async embedQuery(text: string): Promise { - const embeddings = await this._embed([text]); + const embeddings = (await this.embeddingWithRetry(params)) || [[]]; return embeddings[0]; } - /** - * Generates embeddings for an array of image URIs. - * @param uris - An array of image URIs to generate embeddings for. - * @returns A Promise that resolves to an array of embeddings. - */ - async embedImages(uris: string[]): Promise { - const input = uris.map((uri) => (isLocal(uri) ? getBytesStr(uri) : uri)); - return this._embed(input); + private getParams( + input: JinaEmbeddingsInput[], + query?: boolean + ): EmbeddingCreateParams { + return { + model: this.model, + input, + dimensions: this.dimensions, + task: query ? 
"retrieval.query" : "retrieval.passage", + normalized: this.normalized, + }; + } + + private async embeddingWithRetry(body: EmbeddingCreateParams) { + const response = await fetch(this.baseUrl, { + method: "POST", + headers: { + "Content-Type": "application/json", + Authorization: `Bearer ${this.apiKey}`, + }, + body: JSON.stringify(body), + }); + const embeddingData: EmbeddingResponse | EmbeddingErrorResponse = + await response.json(); + if ("detail" in embeddingData && embeddingData.detail) { + throw new Error(`${embeddingData.detail}`); + } + return (embeddingData as EmbeddingResponse).data.map( + ({ embedding }) => embedding + ); } } diff --git a/libs/langchain-community/src/embeddings/tests/jina.int.test.ts b/libs/langchain-community/src/embeddings/tests/jina.int.test.ts index 4c9859cbb589..24396960b599 100644 --- a/libs/langchain-community/src/embeddings/tests/jina.int.test.ts +++ b/libs/langchain-community/src/embeddings/tests/jina.int.test.ts @@ -31,10 +31,10 @@ test("Test JinaEmbeddings concurrency", async () => { ); }); -test("Test JinaEmbeddings.embedImages", async () => { +test("Test JinaEmbeddings embedImages", async () => { const embeddings = new JinaEmbeddings(); - const res = await embeddings.embedImages([ - "https://avatars.githubusercontent.com/u/126733545?v=4", + const res = await embeddings.embedDocuments([ + { image: "https://avatars.githubusercontent.com/u/126733545?v=4" }, ]); expect(typeof res[0][0]).toBe("number"); }); diff --git a/libs/langchain-community/src/utils/local_image_to_base64.ts b/libs/langchain-community/src/utils/local_image_to_base64.ts new file mode 100644 index 000000000000..50da1af1ffb1 --- /dev/null +++ b/libs/langchain-community/src/utils/local_image_to_base64.ts @@ -0,0 +1,7 @@ +import { Buffer } from "node:buffer"; +import fs from "node:fs/promises"; + +export async function localImageToBase64(filePath: string): Promise { + const data = await fs.readFile(filePath); + return Buffer.from(data).toString("base64"); +}