From 1200ec95d0bca9f0dd463b03e40dd9b0b5bb5b8f Mon Sep 17 00:00:00 2001 From: Chooooo Date: Tue, 24 Dec 2024 10:34:59 +0900 Subject: [PATCH 1/7] feat(community): Add TTL support to UpstashRedisCache (#7422) --- .../src/cache/chat_models/upstash_redis.ts | 1 + examples/src/cache/upstash_redis.ts | 1 + .../src/caches/upstash_redis.ts | 21 ++++++++++++++----- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/examples/src/cache/chat_models/upstash_redis.ts b/examples/src/cache/chat_models/upstash_redis.ts index 9b8f2b012af6..a2ff3410c30b 100644 --- a/examples/src/cache/chat_models/upstash_redis.ts +++ b/examples/src/cache/chat_models/upstash_redis.ts @@ -7,6 +7,7 @@ const cache = new UpstashRedisCache({ url: "UPSTASH_REDIS_REST_URL", token: "UPSTASH_REDIS_REST_TOKEN", }, + ttl: 3600, }); const model = new ChatOpenAI({ cache }); diff --git a/examples/src/cache/upstash_redis.ts b/examples/src/cache/upstash_redis.ts index e19f0d37908f..e91933104365 100644 --- a/examples/src/cache/upstash_redis.ts +++ b/examples/src/cache/upstash_redis.ts @@ -7,6 +7,7 @@ const cache = new UpstashRedisCache({ url: "UPSTASH_REDIS_REST_URL", token: "UPSTASH_REDIS_REST_TOKEN", }, + ttl: 3600, }); const model = new OpenAI({ cache }); diff --git a/libs/langchain-community/src/caches/upstash_redis.ts b/libs/langchain-community/src/caches/upstash_redis.ts index 1cf89e82c826..fcb7e4a4343c 100644 --- a/libs/langchain-community/src/caches/upstash_redis.ts +++ b/libs/langchain-community/src/caches/upstash_redis.ts @@ -18,6 +18,10 @@ export type UpstashRedisCacheProps = { * An existing Upstash Redis client. */ client?: Redis; + /** + * Time-to-live (TTL) for cached items in seconds. 
+ */ + ttl?: number; }; /** @@ -30,6 +34,7 @@ export type UpstashRedisCacheProps = { * url: "UPSTASH_REDIS_REST_URL", * token: "UPSTASH_REDIS_REST_TOKEN", * }, + * ttl: 3600, // Optional: Cache entries will expire after 1 hour * }); * // Initialize the OpenAI model with Upstash Redis cache for caching responses * const model = new ChatOpenAI({ @@ -42,9 +47,12 @@ export type UpstashRedisCacheProps = { export class UpstashRedisCache extends BaseCache { private redisClient: Redis; + private ttl?: number; + constructor(props: UpstashRedisCacheProps) { super(); - const { config, client } = props; + const { config, client, ttl } = props; + this.ttl = ttl; if (client) { this.redisClient = client; @@ -84,10 +92,13 @@ export class UpstashRedisCache extends BaseCache { public async update(prompt: string, llmKey: string, value: Generation[]) { for (let i = 0; i < value.length; i += 1) { const key = getCacheKey(prompt, llmKey, String(i)); - await this.redisClient.set( - key, - JSON.stringify(serializeGeneration(value[i])) - ); + const serializedValue = JSON.stringify(serializeGeneration(value[i])); + + if (this.ttl) { + await this.redisClient.set(key, serializedValue, { ex: this.ttl }); + } else { + await this.redisClient.set(key, serializedValue); + } } } } From a51bb4a2f435687600469e58b0f8a01bf9b7a67b Mon Sep 17 00:00:00 2001 From: Eduard-Constantin Ibinceanu Date: Tue, 24 Dec 2024 03:35:12 +0200 Subject: [PATCH 2/7] fix(community): Change airtable API request to use POST (#7408) --- .../src/document_loaders/web/airtable.ts | 39 +++++++++---------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/libs/langchain-community/src/document_loaders/web/airtable.ts b/libs/langchain-community/src/document_loaders/web/airtable.ts index 6ec5bb094310..be570e7a2759 100644 --- a/libs/langchain-community/src/document_loaders/web/airtable.ts +++ b/libs/langchain-community/src/document_loaders/web/airtable.ts @@ -8,6 +8,7 @@ export interface AirtableRequestParams { view?: 
string; maxRecords?: number; filterByFormula?: string; + fields?: string[]; } export interface AirtableLoaderOptions { @@ -76,8 +77,8 @@ export class AirtableLoader extends BaseDocumentLoader { try { do { - const url = this.constructUrl(offset); - const data = await this.asyncCaller.call(() => this.fetchRecords(url)); + const body = this.constructRequestBody(offset); + const data = await this.asyncCaller.call(() => this.fetchRecords(body)); data.records.forEach((record: AirtableRecord) => documents.push(this.createDocument(record)) ); @@ -102,8 +103,8 @@ export class AirtableLoader extends BaseDocumentLoader { let offset: string | undefined; try { do { - const url = this.constructUrl(offset); - const data = await this.asyncCaller.call(() => this.fetchRecords(url)); + const body = this.constructRequestBody(offset); + const data = await this.asyncCaller.call(() => this.fetchRecords(body)); for (const record of data.records) { yield this.createDocument(record); @@ -118,37 +119,35 @@ export class AirtableLoader extends BaseDocumentLoader { } /** - * Constructs the Airtable API request URL with pagination and query parameters. + * Constructs the request body for an API call. * - * @param offset - The pagination offset returned by the previous request. - * @returns A fully constructed URL for the API request. + * @param offset - An optional string representing the offset for pagination. + * @returns A record containing the combined properties of `kwargs` and the provided offset. 
*/ - private constructUrl(offset?: string): string { - const url = new URL( - `${AirtableLoader.BASE_URL}/${this.baseId}/${this.tableId}` - ); - if (offset) url.searchParams.append("offset", offset); - if (this.kwargs.view) url.searchParams.append("view", this.kwargs.view); - if (this.kwargs.maxRecords) - url.searchParams.append("maxRecords", this.kwargs.maxRecords.toString()); - if (this.kwargs.filterByFormula) - url.searchParams.append("filterByFormula", this.kwargs.filterByFormula); - return url.toString(); + private constructRequestBody(offset?: string): Record { + return { ...this.kwargs, offset }; } /** * Sends the API request to Airtable and handles the response. * Includes a timeout to prevent hanging on unresponsive requests. * - * @param url - The Airtable API request URL. + * @param body - The request payload to be sent to the Airtable API. * @returns A promise that resolves to an AirtableResponse object. + * @throws Will throw an error if the Airtable API request fails. */ - private async fetchRecords(url: string): Promise { + private async fetchRecords( + body: Record + ): Promise { + const url = `${AirtableLoader.BASE_URL}/${this.baseId}/${this.tableId}`; try { const response = await fetch(url, { + method: "POST", headers: { Authorization: `Bearer ${this.apiToken}`, + "Content-Type": "application/json", }, + body: JSON.stringify(body), }); if (!response.ok) { From 903948448d92648fae57d389b24ced896f991d00 Mon Sep 17 00:00:00 2001 From: AM Date: Mon, 23 Dec 2024 20:43:58 -0500 Subject: [PATCH 3/7] fix(cohere,langchain): handle exceptions in compressDocuments and formatDocuments methods due to empty documents (#7372) --- langchain/src/chains/combine_documents/base.ts | 3 +++ libs/langchain-cohere/src/rerank.ts | 3 +++ 2 files changed, 6 insertions(+) diff --git a/langchain/src/chains/combine_documents/base.ts b/langchain/src/chains/combine_documents/base.ts index cf4cb70c833c..5ffa53df6c3d 100644 --- a/langchain/src/chains/combine_documents/base.ts +++ 
b/langchain/src/chains/combine_documents/base.ts @@ -21,6 +21,9 @@ export async function formatDocuments({ documents: Document[]; config?: RunnableConfig; }) { + if (documents == null || documents.length === 0) { + return ""; + } const formattedDocs = await Promise.all( documents.map((document) => documentPrompt diff --git a/libs/langchain-cohere/src/rerank.ts b/libs/langchain-cohere/src/rerank.ts index 78a13efa2f83..0dd5834e2527 100644 --- a/libs/langchain-cohere/src/rerank.ts +++ b/libs/langchain-cohere/src/rerank.ts @@ -60,6 +60,9 @@ export class CohereRerank extends BaseDocumentCompressor { documents: Array, query: string ): Promise> { + if (documents == null || documents.length === 0) { + return []; + } const _docs = documents.map((doc) => doc.pageContent); const { results } = await this.client.rerank({ model: this.model, From a7dd5d2a68f5c9d7ef10fd1af672c18e33d0cd78 Mon Sep 17 00:00:00 2001 From: Jacob Lee Date: Mon, 23 Dec 2024 18:01:00 -0800 Subject: [PATCH 4/7] release(cohere): 0.3.2 (#7424) --- libs/langchain-cohere/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/langchain-cohere/package.json b/libs/langchain-cohere/package.json index 2f9eedec14c7..e2a3cbfe860e 100644 --- a/libs/langchain-cohere/package.json +++ b/libs/langchain-cohere/package.json @@ -1,6 +1,6 @@ { "name": "@langchain/cohere", - "version": "0.3.1", + "version": "0.3.2", "description": "Cohere integration for LangChain.js", "type": "module", "engines": { From be3fc04304416f83487b44c376cfce4c7daa9597 Mon Sep 17 00:00:00 2001 From: boni-teppanyaki <94654557+boni-teppanyaki@users.noreply.github.com> Date: Mon, 23 Dec 2024 21:54:08 -0500 Subject: [PATCH 5/7] feat(community): Port ArxivRetriever to LangChainJS (#7250) Co-authored-by: Antonio Ferreras Co-authored-by: Dhruvin Patel Co-authored-by: Yiran Gogo Yu Co-authored-by: Jacob Lee --- .../retrievers/arxiv-retriever.mdx | 99 ++++++ examples/src/retrievers/arxiv.ts | 65 ++++ 
libs/langchain-community/.gitignore | 4 + libs/langchain-community/langchain.config.js | 2 + libs/langchain-community/package.json | 18 + .../src/load/import_constants.ts | 1 + .../src/retrievers/arxiv.ts | 50 +++ .../src/retrievers/tests/arxiv.int.test.ts | 318 ++++++++++++++++++ libs/langchain-community/src/utils/arxiv.ts | 242 +++++++++++++ yarn.lock | 15 + 10 files changed, 814 insertions(+) create mode 100644 docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx create mode 100644 examples/src/retrievers/arxiv.ts create mode 100644 libs/langchain-community/src/retrievers/arxiv.ts create mode 100644 libs/langchain-community/src/retrievers/tests/arxiv.int.test.ts create mode 100644 libs/langchain-community/src/utils/arxiv.ts diff --git a/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx b/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx new file mode 100644 index 000000000000..254c90ca49fe --- /dev/null +++ b/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx @@ -0,0 +1,99 @@ +# ArxivRetriever + +The `arXiv Retriever` allows users to query the arXiv database for academic articles. It supports both full-document retrieval (PDF parsing) and summary-based retrieval. + +For detailed documentation of all ArxivRetriever features and configurations, head to the [API reference](https://api.js.langchain.com/classes/_langchain_community.retrievers_arxiv.ArxivRetriever.html) + +## Features + +- Query Flexibility: Search using natural language queries or specific arXiv IDs. +- Full-Document Retrieval: Option to fetch and parse PDFs. +- Summaries as Documents: Retrieve summaries for faster results. +- Customizable Options: Configure maximum results and output format. 
+ +## Integration details + +| Retriever | Source | Package | +| ---------------- | ---------------------------- | ---------------------------------------------------------------------------- | +| `ArxivRetriever` | Academic articles from arXiv | [`@langchain/community`](https://www.npmjs.com/package/@langchain/community) | + +## Setup + +Ensure the following dependencies are installed: + +- `pdf-parse` for parsing PDFs +- `fast-xml-parser` for parsing XML responses from the arXiv API + +```npm2yarn +npm install pdf-parse fast-xml-parser +``` + +## Instantiation + +```typescript +const retriever = new ArxivRetriever({ + getFullDocuments: false, // Set to true to fetch full documents (PDFs) + maxSearchResults: 5, // Maximum number of results to retrieve +}); +``` + +## Usage + +Use the `invoke` method to search arXiv for relevant articles. You can use either natural language queries or specific arXiv IDs. + +```typescript +const query = "quantum computing"; + +const documents = await retriever.invoke(query); +documents.forEach((doc) => { + console.log("Title:", doc.metadata.title); + console.log("Content:", doc.pageContent); // Article summary (full PDF text when getFullDocuments is true) +}); +``` + +## Use within a chain + +Like other retrievers, `ArxivRetriever` can be incorporated into LLM applications via chains. Below is an example of using the retriever within a chain: + +```typescript +import { ChatOpenAI } from "@langchain/openai"; +import { ChatPromptTemplate } from "@langchain/core/prompts"; +import { + RunnablePassthrough, + RunnableSequence, +} from "@langchain/core/runnables"; +import { StringOutputParser } from "@langchain/core/output_parsers"; +import type { Document } from "@langchain/core/documents"; + +const llm = new ChatOpenAI({ + model: "gpt-4o-mini", + temperature: 0, +}); + +const prompt = ChatPromptTemplate.fromTemplate(` +Answer the question based only on the context provided. 
+ +Context: {context} + +Question: {question}`); + +const formatDocs = (docs: Document[]) => { + return docs.map((doc) => doc.pageContent).join("\n\n"); +}; + +const ragChain = RunnableSequence.from([ + { + context: retriever.pipe(formatDocs), + question: new RunnablePassthrough(), + }, + prompt, + llm, + new StringOutputParser(), +]); + +await ragChain.invoke("What are the latest advances in quantum computing?"); +``` + +## API reference + +For detailed documentation of all ArxivRetriever features and configurations, head to the [API reference](https://api.js.langchain.com/classes/_langchain_community.retrievers_arxiv.ArxivRetriever.html) diff --git a/examples/src/retrievers/arxiv.ts b/examples/src/retrievers/arxiv.ts new file mode 100644 index 000000000000..3e74502e7d49 --- /dev/null +++ b/examples/src/retrievers/arxiv.ts @@ -0,0 +1,65 @@ +import { ArxivRetriever } from "@langchain/community/retrievers/arxiv"; + +export const run = async () => { + /* + Direct look up by arXiv ID, for full texts + */ + + const queryId = "1605.08386 2103.03404"; + const retrieverById = new ArxivRetriever({ + getFullDocuments: true, + maxSearchResults: 5, + }); + const documentsById = await retrieverById.invoke(queryId); + console.log(documentsById); + + /* + [ + Document + { + pageContent, + metadata: + { + author, + id, + published, + source, + updated, + url + } + }, + Document + { + pageContent, + metadata + } + ] + */ + + /* + Search with natural language query, for summaries + */ + + const queryNat = "What is the ImageBind model?"; + const retrieverByNat = new ArxivRetriever({ + getFullDocuments: false, + maxSearchResults: 2, + }); + const documentsByQuery = await retrieverByNat.invoke(queryNat); + console.log(documentsByQuery); + + /* + [ + Document + { + pageContent, + metadata + }, + Document + { + pageContent, + metadata + } + ] + */ +}; diff --git a/libs/langchain-community/.gitignore b/libs/langchain-community/.gitignore index 4fde6ded00ff..dcef7c9a15d9 100644 --- 
a/libs/langchain-community/.gitignore +++ b/libs/langchain-community/.gitignore @@ -626,6 +626,10 @@ retrievers/amazon_knowledge_base.cjs retrievers/amazon_knowledge_base.js retrievers/amazon_knowledge_base.d.ts retrievers/amazon_knowledge_base.d.cts +retrievers/arxiv.cjs +retrievers/arxiv.js +retrievers/arxiv.d.ts +retrievers/arxiv.d.cts retrievers/bm25.cjs retrievers/bm25.js retrievers/bm25.d.ts diff --git a/libs/langchain-community/langchain.config.js b/libs/langchain-community/langchain.config.js index 62abeef26886..f0c1914d5e78 100644 --- a/libs/langchain-community/langchain.config.js +++ b/libs/langchain-community/langchain.config.js @@ -198,6 +198,7 @@ export const config = { // retrievers "retrievers/amazon_kendra": "retrievers/amazon_kendra", "retrievers/amazon_knowledge_base": "retrievers/amazon_knowledge_base", + "retrievers/arxiv": "retrievers/arxiv", "retrievers/bm25": "retrievers/bm25", "retrievers/chaindesk": "retrievers/chaindesk", "retrievers/databerry": "retrievers/databerry", @@ -437,6 +438,7 @@ export const config = { "chat_models/zhipuai", "retrievers/amazon_kendra", "retrievers/amazon_knowledge_base", + "retrievers/arxiv", "retrievers/dria", "retrievers/metal", "retrievers/supabase", diff --git a/libs/langchain-community/package.json b/libs/langchain-community/package.json index b759b36153af..1a45528ec5b6 100644 --- a/libs/langchain-community/package.json +++ b/libs/langchain-community/package.json @@ -166,6 +166,7 @@ "eslint-plugin-no-instanceof": "^1.0.1", "eslint-plugin-prettier": "^4.2.1", "faiss-node": "^0.5.1", + "fast-xml-parser": "^4.5.1", "firebase-admin": "^11.9.0 || ^12.0.0", "google-auth-library": "^9.10.0", "googleapis": "^126.0.1", @@ -302,6 +303,7 @@ "duck-duck-scrape": "^2.2.5", "epub2": "^3.0.1", "faiss-node": "^0.5.1", + "fast-xml-parser": "*", "firebase-admin": "^11.9.0 || ^12.0.0", "google-auth-library": "*", "googleapis": "*", @@ -584,6 +586,9 @@ "faiss-node": { "optional": true }, + "fast-xml-parser": { + "optional": true 
+ }, "firebase-admin": { "optional": true }, @@ -2125,6 +2130,15 @@ "import": "./retrievers/amazon_knowledge_base.js", "require": "./retrievers/amazon_knowledge_base.cjs" }, + "./retrievers/arxiv": { + "types": { + "import": "./retrievers/arxiv.d.ts", + "require": "./retrievers/arxiv.d.cts", + "default": "./retrievers/arxiv.d.ts" + }, + "import": "./retrievers/arxiv.js", + "require": "./retrievers/arxiv.cjs" + }, "./retrievers/bm25": { "types": { "import": "./retrievers/bm25.d.ts", @@ -3774,6 +3788,10 @@ "retrievers/amazon_knowledge_base.js", "retrievers/amazon_knowledge_base.d.ts", "retrievers/amazon_knowledge_base.d.cts", + "retrievers/arxiv.cjs", + "retrievers/arxiv.js", + "retrievers/arxiv.d.ts", + "retrievers/arxiv.d.cts", "retrievers/bm25.cjs", "retrievers/bm25.js", "retrievers/bm25.d.ts", diff --git a/libs/langchain-community/src/load/import_constants.ts b/libs/langchain-community/src/load/import_constants.ts index 5930f82690db..014d418e872d 100644 --- a/libs/langchain-community/src/load/import_constants.ts +++ b/libs/langchain-community/src/load/import_constants.ts @@ -100,6 +100,7 @@ export const optionalImportEntrypoints: string[] = [ "langchain_community/callbacks/handlers/upstash_ratelimit", "langchain_community/retrievers/amazon_kendra", "langchain_community/retrievers/amazon_knowledge_base", + "langchain_community/retrievers/arxiv", "langchain_community/retrievers/dria", "langchain_community/retrievers/metal", "langchain_community/retrievers/supabase", diff --git a/libs/langchain-community/src/retrievers/arxiv.ts b/libs/langchain-community/src/retrievers/arxiv.ts new file mode 100644 index 000000000000..8009ce9f8320 --- /dev/null +++ b/libs/langchain-community/src/retrievers/arxiv.ts @@ -0,0 +1,50 @@ +import { BaseRetriever, BaseRetrieverInput } from "@langchain/core/retrievers"; +import { Document } from "@langchain/core/documents"; +import { + searchArxiv, + loadDocsFromResults, + getDocsFromSummaries, +} from "../utils/arxiv.js"; + +export type 
ArxivRetrieverOptions = { + getFullDocuments?: boolean; + maxSearchResults?: number; +} & BaseRetrieverInput; + +/** + * A retriever that searches arXiv for relevant articles based on a query. + * It can retrieve either full documents (PDFs) or just summaries. + */ +export class ArxivRetriever extends BaseRetriever { + static lc_name() { + return "ArxivRetriever"; + } + + lc_namespace = ["langchain", "retrievers", "arxiv_retriever"]; + + getFullDocuments = false; + + maxSearchResults = 10; + + constructor(options: ArxivRetrieverOptions = {}) { + super(options); + this.getFullDocuments = options.getFullDocuments ?? this.getFullDocuments; + this.maxSearchResults = options.maxSearchResults ?? this.maxSearchResults; + } + + async _getRelevantDocuments(query: string): Promise { + try { + const results = await searchArxiv(query, this.maxSearchResults); + + if (this.getFullDocuments) { + // Fetch and parse PDFs to get full documents + return await loadDocsFromResults(results); + } else { + // Use summaries as documents + return getDocsFromSummaries(results); + } + } catch (error) { + throw new Error(`Error retrieving documents from arXiv.`); + } + } +} diff --git a/libs/langchain-community/src/retrievers/tests/arxiv.int.test.ts b/libs/langchain-community/src/retrievers/tests/arxiv.int.test.ts new file mode 100644 index 000000000000..bb05f11504e5 --- /dev/null +++ b/libs/langchain-community/src/retrievers/tests/arxiv.int.test.ts @@ -0,0 +1,318 @@ +import { test, expect } from "@jest/globals"; +import { ArxivRetriever } from "../arxiv.js"; + +test("ArxivRetriever fetching document summaries test", async () => { + // Sample integration test for ArxivRetriever using the "machine learning" query + const retriever = new ArxivRetriever({ + getFullDocuments: false, + maxSearchResults: 5, + }); + const query = "machine learning"; + const results = await retriever._getRelevantDocuments(query); + + expect(results).toBeDefined(); + expect(results.length).toBeGreaterThan(0); + 
expect(results.length).toBeLessThanOrEqual(5); + + for (let i = 0; i < results.length; i += 1) { + expect(results[i]).toHaveProperty("pageContent"); + expect(results[i].pageContent).toBeDefined(); + + expect(results[i]).toHaveProperty("metadata"); + expect(results[i].metadata).toBeInstanceOf(Object); + expect(results[i].metadata).toHaveProperty("authors"); + expect(results[i].metadata.authors).toBeInstanceOf(Array); + expect(results[i].metadata).toHaveProperty("id"); + expect(results[i].metadata.id).toContain("arxiv.org"); + expect(results[i].metadata).toHaveProperty("published"); + expect(results[i].metadata.published).toMatch( + /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/ + ); + expect(results[i].metadata).toHaveProperty("source"); + expect(results[i].metadata.source).toBe("arxiv"); + expect(results[i].metadata).toHaveProperty("title"); + expect(results[i].metadata).toHaveProperty("updated"); + expect(results[i].metadata.updated).toMatch( + /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/ + ); + expect(results[i].metadata).toHaveProperty("url"); + expect(results[i].metadata.url).toContain("arxiv.org"); + } +}); + +test("ArxivRetriever fetching document summaries with invalid query test", async () => { + // Sample test for ArxivRetriever using an invalid query + const retriever = new ArxivRetriever({ + getFullDocuments: false, + maxSearchResults: 5, + }); + const query = "fjalsdkjfw"; + const results = await retriever._getRelevantDocuments(query); + + expect(results).toBeDefined(); + expect(results.length).toBe(0); +}); + +test("ArxivRetriever fetching document summaries with empty query test", async () => { + // Sample test for ArxivRetriever using an empty query + const retriever = new ArxivRetriever({ + getFullDocuments: false, + maxSearchResults: 5, + }); + const query = ""; + const results = await retriever._getRelevantDocuments(query); + + expect(results).toBeDefined(); + expect(results.length).toBe(0); +}); + +test("ArxivRetriever fetching document summaries with 
invalid maxSearchResults test", async () => { + // Sample test for ArxivRetriever using an invalid maxSearchResults + try { + const retriever = new ArxivRetriever({ + getFullDocuments: true, + maxSearchResults: -1, + }); + const query = "machine learning"; + const results = await retriever._getRelevantDocuments(query); + expect(results).toBeUndefined(); + expect(results.length).toBe(0); + } catch (error) { + expect(error).toBeDefined(); + expect(error).toBeInstanceOf(Error); + } +}); + +test("ArxivRetriever fetching document summaries with zero maxSearchResults test", async () => { + // Sample test for ArxivRetriever using an zero maxSearchResults + try { + const retriever = new ArxivRetriever({ + getFullDocuments: true, + maxSearchResults: 0, + }); + const query = "machine learning"; + const results = await retriever._getRelevantDocuments(query); + expect(results).toBeUndefined(); + expect(results.length).toBe(0); + } catch (error) { + expect(error).toBeDefined(); + expect(error).toBeInstanceOf(Error); + } +}); + +test("ArxivRetriever fetching full documents test", async () => { + // Sample test for fetching full documents with ArxivRetriever + const retriever = new ArxivRetriever({ + getFullDocuments: true, + maxSearchResults: 5, + }); + const query = "machine learning"; + const results = await retriever._getRelevantDocuments(query); + + expect(results).toBeDefined(); + expect(results.length).toBeGreaterThan(0); + expect(results.length).toBeLessThanOrEqual(5); + + for (let i = 0; i < results.length; i += 1) { + expect(results[i]).toHaveProperty("pageContent"); + expect(results[i].pageContent).toBeDefined(); + + expect(results[i]).toHaveProperty("id"); + + expect(results[i]).toHaveProperty("metadata"); + expect(results[i].metadata).toBeInstanceOf(Object); + expect(results[i].metadata).toHaveProperty("authors"); + expect(results[i].metadata.authors).toBeInstanceOf(Array); + expect(results[i].metadata).toHaveProperty("id"); + 
expect(results[i].metadata.id).toContain("arxiv.org"); + expect(results[i].metadata).toHaveProperty("published"); + expect(results[i].metadata.published).toMatch( + /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/ + ); + expect(results[i].metadata).toHaveProperty("source"); + expect(results[i].metadata.source).toBe("arxiv"); + expect(results[i].metadata).toHaveProperty("title"); + expect(results[i].metadata).toHaveProperty("updated"); + expect(results[i].metadata.updated).toMatch( + /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/ + ); + expect(results[i].metadata).toHaveProperty("url"); + expect(results[i].metadata.url).toContain("arxiv.org"); + expect(results[i].metadata).toHaveProperty("summary"); + } +}); + +test("ArxivRetriever fetching full documents with invalid query test", async () => { + // Sample test for fetching full documents with ArxivRetriever using an invalid query + const retriever = new ArxivRetriever({ + getFullDocuments: true, + maxSearchResults: 5, + }); + const query = "fjalsdkjfw"; + const results = await retriever._getRelevantDocuments(query); + + expect(results).toBeDefined(); + expect(results.length).toBe(0); +}); + +test("ArxivRetriever fetching full documents with empty query test", async () => { + // Sample test for fetching full documents with ArxivRetriever using an empty query + const retriever = new ArxivRetriever({ + getFullDocuments: true, + maxSearchResults: 5, + }); + const query = ""; + const results = await retriever._getRelevantDocuments(query); + + expect(results).toBeDefined(); + expect(results.length).toBe(0); +}); + +test("ArxivRetriever fetching full documents with invalid maxSearchResults test", async () => { + // Sample test for fetching full documents with ArxivRetriever using an invalid maxSearchResults + try { + const retriever = new ArxivRetriever({ + getFullDocuments: true, + maxSearchResults: -1, + }); + const query = "machine learning"; + const results = await retriever._getRelevantDocuments(query); + 
expect(results).toBeUndefined(); + expect(results.length).toBe(0); + } catch (error) { + expect(error).toBeDefined(); + expect(error).toBeInstanceOf(Error); + } +}); + +test("ArxivRetriever fetching full documents with zero maxSearchResults", async () => { + // Sample test for fetching full documents with ArxivRetriever using an zero maxSearchResults + try { + const retriever = new ArxivRetriever({ + getFullDocuments: true, + maxSearchResults: 0, + }); + const query = "machine learning"; + const results = await retriever._getRelevantDocuments(query); + expect(results).toBeUndefined(); + expect(results.length).toBe(0); + } catch (error) { + expect(error).toBeDefined(); + expect(error).toBeInstanceOf(Error); + } +}); + +test("ArxivRetriever search articles by id test", async () => { + // Sample test for fetching articles by arXiv IDs + const fetchIds = "2103.03404 2103.03405"; + const retriever = new ArxivRetriever({ + getFullDocuments: false, + maxSearchResults: 5, + }); + const results = await retriever.invoke(fetchIds); + + expect(results).toBeDefined(); + expect(results.length).toBe(2); + + for (let i = 0; i < results.length; i += 1) { + expect(results[i]).toHaveProperty("pageContent"); + expect(results[i].pageContent).toBeDefined(); + + expect(results[i]).toHaveProperty("metadata"); + expect(results[i].metadata).toBeInstanceOf(Object); + expect(results[i].metadata).toHaveProperty("authors"); + expect(results[i].metadata.authors).toBeInstanceOf(Array); + expect(results[i].metadata).toHaveProperty("id"); + expect(results[i].metadata.id).toContain("arxiv.org"); + expect(results[i].metadata).toHaveProperty("published"); + expect(results[i].metadata.published).toMatch( + /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/ + ); + expect(results[i].metadata).toHaveProperty("source"); + expect(results[i].metadata.source).toBe("arxiv"); + expect(results[i].metadata).toHaveProperty("title"); + expect(results[i].metadata).toHaveProperty("updated"); + 
expect(results[i].metadata.updated).toMatch( + /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/ + ); + expect(results[i].metadata).toHaveProperty("url"); + expect(results[i].metadata.url).toContain("arxiv.org"); + } +}); + +test("ArxivRetriever search articles by id with invalid id test", async () => { + // Sample test for fetching articles by arXiv IDs with an invalid ID + const fetchIds = "2103.03404 2103.03405 1234.56789"; + const retriever = new ArxivRetriever({ + getFullDocuments: false, + maxSearchResults: 5, + }); + const results = await retriever.invoke(fetchIds); + + expect(results).toBeDefined(); + expect(results.length).toBeLessThan(3); +}); + +test("ArxivRetriever search articles by id with empty id test", async () => { + // Sample test for fetching articles by arXiv IDs with an empty ID + const fetchIds = ""; + const retriever = new ArxivRetriever({ + getFullDocuments: false, + maxSearchResults: 5, + }); + const results = await retriever.invoke(fetchIds); + + expect(results).toBeDefined(); + expect(results.length).toBe(0); +}); + +test("ArxivRetriever search articles by id with invalid maxSearchResults test", async () => { + // Sample test for fetching articles by arXiv IDs with an invalid maxSearchResults + try { + const fetchIds = "2103.03404 2103.03405"; + const retriever = new ArxivRetriever({ + getFullDocuments: false, + maxSearchResults: -1, + }); + const results = await retriever.invoke(fetchIds); + expect(results).toBeUndefined(); + expect(results.length).toBe(0); + } catch (error) { + expect(error).toBeDefined(); + expect(error).toBeInstanceOf(Error); + } +}); + +test("ArxivRetriever search articles by id with invalid id and maxSearchResults test", async () => { + // Sample test for fetching articles by arXiv IDs with an invalid ID and maxSearchResults + try { + const fetchIds = "2103.03404 2103.03405 1234.56789"; + const retriever = new ArxivRetriever({ + getFullDocuments: false, + maxSearchResults: -1, + }); + const results = await 
retriever.invoke(fetchIds); + expect(results).toBeUndefined(); + expect(results.length).toBe(0); + } catch (error) { + expect(error).toBeDefined(); + expect(error).toBeInstanceOf(Error); + } +}); + +test("ArxivRetriever search articles by id with invalid id and zero maxSearchResults test", async () => { + // Sample test for fetching articles by arXiv IDs with an invalid ID and zero maxSearchResults + try { + const fetchIds = "2103.03404 2103.03405 1234.56789"; + const retriever = new ArxivRetriever({ + getFullDocuments: false, + maxSearchResults: 0, + }); + const results = await retriever.invoke(fetchIds); + expect(results).toBeUndefined(); + expect(results.length).toBe(0); + } catch (error) { + expect(error).toBeDefined(); + expect(error).toBeInstanceOf(Error); + } +}); diff --git a/libs/langchain-community/src/utils/arxiv.ts b/libs/langchain-community/src/utils/arxiv.ts new file mode 100644 index 000000000000..6a79b78a776a --- /dev/null +++ b/libs/langchain-community/src/utils/arxiv.ts @@ -0,0 +1,242 @@ +/* eslint-disable import/no-extraneous-dependencies */ +import { Document } from "@langchain/core/documents"; +import { XMLParser } from "fast-xml-parser"; + +import { PDFLoader } from "../document_loaders/fs/pdf.js"; + +// Interface for processed arXiv entry +interface ArxivEntry { + id: string; + title: string; + summary: string; + published: string; + updated: string; + authors: string[]; + pdfUrl: string; + links: any[]; +} + +// Used to check if the query is an arXiv ID, or a natural language query +export function isArXivIdentifier(query: string): boolean { + const arxivIdRegex = /^\d{4}\.\d{4,5}(v\d+)?$|^\d{7}(\.\d+)?(v\d+)?$/; + return arxivIdRegex.test(query.trim()); +} + +// Used to fetch direct arXiv articles by IDs (supports multiple IDs) +export async function fetchDirectArxivArticle( + arxivIds: string +): Promise { + try { + const idList = arxivIds + .split(/[\s,]+/) + .map((id) => id.trim()) + .filter(Boolean) + .join(","); + const url = 
`http://export.arxiv.org/api/query?id_list=${idList}`; + const response = await fetch(url); + + if (!response.ok) { + throw new Error(`HTTP error! status: ${response.status}`); + } + + const xml = await response.text(); + + const parser = new XMLParser({ + ignoreAttributes: false, + attributeNamePrefix: "@_", + }); + const result = parser.parse(xml); + let entries = result.feed.entry; + + if (!entries) { + return []; + } + + // Ensure entries is an array + if (!Array.isArray(entries)) { + entries = [entries]; + } + + const processedEntries = entries.map(processEntry); + + return processedEntries; + } catch (error) { + throw new Error(`Failed to fetch articles with IDs ${arxivIds}`); + } +} + +// Used to fetch arXiv results by natural language query with maxResults parameter +export async function fetchArxivResultsByQuery( + query: string, + start = 0, + maxResults = 10 +): Promise { + try { + const encodedQuery = encodeURIComponent(query); + const url = `http://export.arxiv.org/api/query?search_query=all:${encodedQuery}&start=${start}&max_results=${maxResults}`; + const response = await fetch(url); + + if (!response.ok) { + throw new Error(`HTTP error! 
status: ${response.status}`); + } + + const xml = await response.text(); + + const parser = new XMLParser({ + ignoreAttributes: false, + attributeNamePrefix: "@_", + }); + const result = parser.parse(xml); + let entries = result.feed.entry; + + if (!entries) { + return []; + } + + // Ensure entries is an array + if (!Array.isArray(entries)) { + entries = [entries]; + } + + const processedEntries = entries.map(processEntry); + + return processedEntries; + } catch (error) { + throw new Error(`Failed to fetch articles with query "${query}"`); + } +} + +// Used to search for arXiv articles with a maxResults parameter +export async function searchArxiv( + query: string, + maxResults = 3 +): Promise { + if (isArXivIdentifier(query)) { + return await fetchDirectArxivArticle(query); + } else { + return await fetchArxivResultsByQuery(query, 0, maxResults); + } +} + +// Used to fetch and parse PDF to text +export async function fetchAndParsePDF(pdfUrl: string): Promise { + try { + // Fetch the PDF + const response = await fetch(pdfUrl); + + if (!response.ok) { + throw new Error(`HTTP error! 
status: ${response.status}`); + } + + const buffer = await response.arrayBuffer(); + + // Convert the ArrayBuffer to a Blob + const blob = new Blob([buffer], { type: "application/pdf" }); + + // Use PDFLoader to process the PDF + const loader = new PDFLoader(blob, { splitPages: false }); // Pass the Blob + const docs: Document[] = await loader.load(); + + // Combine all document content into a single string + const content = docs.map((doc) => doc.pageContent).join("\n\n"); + return content; + } catch (error) { + throw new Error(`Failed to fetch or parse PDF from ${pdfUrl}`); + } +} + +// Used to load raw text from each search result, and convert to Document instances +export async function loadDocsFromResults( + results: ArxivEntry[] +): Promise { + const docs: Document[] = []; + for (const result of results) { + const pdfUrl = result.pdfUrl; + try { + const pdfContent = await fetchAndParsePDF(pdfUrl); + const metadata = { + id: result.id, + title: result.title, + authors: result.authors, + published: result.published, + updated: result.updated, + source: "arxiv", + url: result.id, + summary: result.summary, + }; + const doc = new Document({ + pageContent: pdfContent, + metadata, + }); + docs.push(doc); + } catch (error) { + throw new Error(`Error loading document from ${pdfUrl}`); + } + } + return docs; +} + +// Used to convert metadata and summaries to Document instances +export function getDocsFromSummaries(results: ArxivEntry[]): Document[] { + const docs: Document[] = []; + for (const result of results) { + const metadata = { + id: result.id, + title: result.title, + authors: result.authors, + published: result.published, + updated: result.updated, + source: "arxiv", + url: result.id, + }; + const doc = new Document({ + pageContent: result.summary, + metadata, + }); + docs.push(doc); + } + return docs; +} + +// Helper function to process each arXiv entry +function processEntry(entry: any): ArxivEntry { + const id = entry.id; + const title = 
entry.title.replace(/\s+/g, " ").trim(); + const summary = entry.summary.replace(/\s+/g, " ").trim(); + const published = entry.published; + const updated = entry.updated; + + // Extract authors + let authors: string[] = []; + if (Array.isArray(entry.author)) { + authors = entry.author.map((author: any) => author.name); + } else if (entry.author) { + authors = [entry.author.name]; + } + + // Extract links + let links: any[] = []; + if (Array.isArray(entry.link)) { + links = entry.link; + } else if (entry.link) { + links = [entry.link]; + } + + // Extract PDF link + let pdfUrl = id.replace("/abs/", "/pdf/") + ".pdf"; + const pdfLinkObj = links.find((link: any) => link["@_title"] === "pdf"); + if (pdfLinkObj && pdfLinkObj["@_href"]) { + pdfUrl = pdfLinkObj["@_href"]; + } + + return { + id, + title, + summary, + published, + updated, + authors, + pdfUrl, + links, + }; +} diff --git a/yarn.lock b/yarn.lock index 9e5a48455320..abae3190907d 100644 --- a/yarn.lock +++ b/yarn.lock @@ -11908,6 +11908,7 @@ __metadata: eslint-plugin-prettier: ^4.2.1 expr-eval: ^2.0.2 faiss-node: ^0.5.1 + fast-xml-parser: ^4.5.1 firebase-admin: ^11.9.0 || ^12.0.0 flat: ^5.0.2 google-auth-library: ^9.10.0 @@ -12050,6 +12051,7 @@ __metadata: duck-duck-scrape: ^2.2.5 epub2: ^3.0.1 faiss-node: ^0.5.1 + fast-xml-parser: "*" firebase-admin: ^11.9.0 || ^12.0.0 google-auth-library: "*" googleapis: "*" @@ -12252,6 +12254,8 @@ __metadata: optional: true faiss-node: optional: true + fast-xml-parser: + optional: true firebase-admin: optional: true google-auth-library: @@ -28227,6 +28231,17 @@ __metadata: languageName: node linkType: hard +"fast-xml-parser@npm:^4.5.1": + version: 4.5.1 + resolution: "fast-xml-parser@npm:4.5.1" + dependencies: + strnum: ^1.0.5 + bin: + fxparser: src/cli/cli.js + checksum: aab32d7f08a95b20f9ecdc2d769531a9dc454faf12740873972f8169c04ab9335ac5df1029ebfe829a01ddbb0ec60572cb7769d6be2409e95a9be8fc6a86e92c + languageName: node + linkType: hard + "fastq@npm:^1.6.0": version: 1.15.0 
resolution: "fastq@npm:1.15.0" From 94525f9e09a7ce3e40912938dc6065bfe370631d Mon Sep 17 00:00:00 2001 From: Jacob Lee Date: Mon, 23 Dec 2024 18:58:48 -0800 Subject: [PATCH 6/7] release(community): 0.3.20 (#7425) --- libs/langchain-community/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/langchain-community/package.json b/libs/langchain-community/package.json index 1a45528ec5b6..bceb60def832 100644 --- a/libs/langchain-community/package.json +++ b/libs/langchain-community/package.json @@ -1,6 +1,6 @@ { "name": "@langchain/community", - "version": "0.3.19", + "version": "0.3.20", "description": "Third-party integrations for LangChain.js", "type": "module", "engines": { From 45498632ce2f5d539d84d049bf5b6717f674ac46 Mon Sep 17 00:00:00 2001 From: Jacob Lee Date: Tue, 24 Dec 2024 10:54:04 -0800 Subject: [PATCH 7/7] Release 0.3.8 (#7427) --- langchain/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/langchain/package.json b/langchain/package.json index 34ebd7e4a980..6329b37aac9f 100644 --- a/langchain/package.json +++ b/langchain/package.json @@ -1,6 +1,6 @@ { "name": "langchain", - "version": "0.3.7", + "version": "0.3.8", "description": "Typescript bindings for langchain", "type": "module", "engines": {