From 1200ec95d0bca9f0dd463b03e40dd9b0b5bb5b8f Mon Sep 17 00:00:00 2001
From: Chooooo <contact@choo.ooo>
Date: Tue, 24 Dec 2024 10:34:59 +0900
Subject: [PATCH 1/7] feat(community): Add TTL support to UpstashRedisCache
 (#7422)

---
 .../src/cache/chat_models/upstash_redis.ts    |  1 +
 examples/src/cache/upstash_redis.ts           |  1 +
 .../src/caches/upstash_redis.ts               | 21 ++++++++++++++-----
 3 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/examples/src/cache/chat_models/upstash_redis.ts b/examples/src/cache/chat_models/upstash_redis.ts
index 9b8f2b012af6..a2ff3410c30b 100644
--- a/examples/src/cache/chat_models/upstash_redis.ts
+++ b/examples/src/cache/chat_models/upstash_redis.ts
@@ -7,6 +7,7 @@ const cache = new UpstashRedisCache({
     url: "UPSTASH_REDIS_REST_URL",
     token: "UPSTASH_REDIS_REST_TOKEN",
   },
+  ttl: 3600,
 });
 
 const model = new ChatOpenAI({ cache });
diff --git a/examples/src/cache/upstash_redis.ts b/examples/src/cache/upstash_redis.ts
index e19f0d37908f..e91933104365 100644
--- a/examples/src/cache/upstash_redis.ts
+++ b/examples/src/cache/upstash_redis.ts
@@ -7,6 +7,7 @@ const cache = new UpstashRedisCache({
     url: "UPSTASH_REDIS_REST_URL",
     token: "UPSTASH_REDIS_REST_TOKEN",
   },
+  ttl: 3600,
 });
 
 const model = new OpenAI({ cache });
diff --git a/libs/langchain-community/src/caches/upstash_redis.ts b/libs/langchain-community/src/caches/upstash_redis.ts
index 1cf89e82c826..fcb7e4a4343c 100644
--- a/libs/langchain-community/src/caches/upstash_redis.ts
+++ b/libs/langchain-community/src/caches/upstash_redis.ts
@@ -18,6 +18,10 @@ export type UpstashRedisCacheProps = {
    * An existing Upstash Redis client.
    */
   client?: Redis;
+  /**
+   * Time-to-live (TTL) for cached items in seconds.
+   */
+  ttl?: number;
 };
 
 /**
@@ -30,6 +34,7 @@ export type UpstashRedisCacheProps = {
  *     url: "UPSTASH_REDIS_REST_URL",
  *     token: "UPSTASH_REDIS_REST_TOKEN",
  *   },
+ *   ttl: 3600, // Optional: Cache entries will expire after 1 hour
  * });
  * // Initialize the OpenAI model with Upstash Redis cache for caching responses
  * const model = new ChatOpenAI({
@@ -42,9 +47,12 @@ export type UpstashRedisCacheProps = {
 export class UpstashRedisCache extends BaseCache {
   private redisClient: Redis;
 
+  private ttl?: number;
+
   constructor(props: UpstashRedisCacheProps) {
     super();
-    const { config, client } = props;
+    const { config, client, ttl } = props;
+    this.ttl = ttl;
 
     if (client) {
       this.redisClient = client;
@@ -84,10 +92,13 @@ export class UpstashRedisCache extends BaseCache {
   public async update(prompt: string, llmKey: string, value: Generation[]) {
     for (let i = 0; i < value.length; i += 1) {
       const key = getCacheKey(prompt, llmKey, String(i));
-      await this.redisClient.set(
-        key,
-        JSON.stringify(serializeGeneration(value[i]))
-      );
+      const serializedValue = JSON.stringify(serializeGeneration(value[i]));
+
+      if (this.ttl) {
+        await this.redisClient.set(key, serializedValue, { ex: this.ttl });
+      } else {
+        await this.redisClient.set(key, serializedValue);
+      }
     }
   }
 }

From a51bb4a2f435687600469e58b0f8a01bf9b7a67b Mon Sep 17 00:00:00 2001
From: Eduard-Constantin Ibinceanu <ibinceanu.eduard@yahoo.com>
Date: Tue, 24 Dec 2024 03:35:12 +0200
Subject: [PATCH 2/7] fix(community): Change airtable API request to use POST
 (#7408)

---
 .../src/document_loaders/web/airtable.ts      | 39 +++++++++----------
 1 file changed, 19 insertions(+), 20 deletions(-)

diff --git a/libs/langchain-community/src/document_loaders/web/airtable.ts b/libs/langchain-community/src/document_loaders/web/airtable.ts
index 6ec5bb094310..be570e7a2759 100644
--- a/libs/langchain-community/src/document_loaders/web/airtable.ts
+++ b/libs/langchain-community/src/document_loaders/web/airtable.ts
@@ -8,6 +8,7 @@ export interface AirtableRequestParams {
   view?: string;
   maxRecords?: number;
   filterByFormula?: string;
+  fields?: string[];
 }
 
 export interface AirtableLoaderOptions {
@@ -76,8 +77,8 @@ export class AirtableLoader extends BaseDocumentLoader {
 
     try {
       do {
-        const url = this.constructUrl(offset);
-        const data = await this.asyncCaller.call(() => this.fetchRecords(url));
+        const body = this.constructRequestBody(offset);
+        const data = await this.asyncCaller.call(() => this.fetchRecords(body));
         data.records.forEach((record: AirtableRecord) =>
           documents.push(this.createDocument(record))
         );
@@ -102,8 +103,8 @@ export class AirtableLoader extends BaseDocumentLoader {
     let offset: string | undefined;
     try {
       do {
-        const url = this.constructUrl(offset);
-        const data = await this.asyncCaller.call(() => this.fetchRecords(url));
+        const body = this.constructRequestBody(offset);
+        const data = await this.asyncCaller.call(() => this.fetchRecords(body));
 
         for (const record of data.records) {
           yield this.createDocument(record);
@@ -118,37 +119,35 @@ export class AirtableLoader extends BaseDocumentLoader {
   }
 
   /**
-   * Constructs the Airtable API request URL with pagination and query parameters.
+   * Constructs the JSON body for Airtable's "list records" POST endpoint.
    *
-   * @param offset - The pagination offset returned by the previous request.
-   * @returns A fully constructed URL for the API request.
+   * @param offset - The pagination offset returned by the previous request, if any.
+   * @returns A record containing the combined properties of `kwargs` and the provided offset.
    */
-  private constructUrl(offset?: string): string {
-    const url = new URL(
-      `${AirtableLoader.BASE_URL}/${this.baseId}/${this.tableId}`
-    );
-    if (offset) url.searchParams.append("offset", offset);
-    if (this.kwargs.view) url.searchParams.append("view", this.kwargs.view);
-    if (this.kwargs.maxRecords)
-      url.searchParams.append("maxRecords", this.kwargs.maxRecords.toString());
-    if (this.kwargs.filterByFormula)
-      url.searchParams.append("filterByFormula", this.kwargs.filterByFormula);
-    return url.toString();
+  private constructRequestBody(offset?: string): Record<string, any> {
+    return { ...this.kwargs, offset };
   }
 
   /**
    * Sends the API request to Airtable and handles the response.
    * Includes a timeout to prevent hanging on unresponsive requests.
    *
-   * @param url - The Airtable API request URL.
+   * @param body - The JSON payload POSTed to the table's `/listRecords` endpoint (a bare table URL would create records).
    * @returns A promise that resolves to an AirtableResponse object.
+   * @throws Will throw an error if the Airtable API request fails.
    */
-  private async fetchRecords(url: string): Promise<AirtableResponse> {
+  private async fetchRecords(
+    body: Record<string, any>
+  ): Promise<AirtableResponse> {
+    const url = `${AirtableLoader.BASE_URL}/${this.baseId}/${this.tableId}/listRecords`;
     try {
       const response = await fetch(url, {
+        method: "POST",
         headers: {
           Authorization: `Bearer ${this.apiToken}`,
+          "Content-Type": "application/json",
         },
+        body: JSON.stringify(body),
       });
 
       if (!response.ok) {

From 903948448d92648fae57d389b24ced896f991d00 Mon Sep 17 00:00:00 2001
From: AM <alihassan.m@hotmail.com>
Date: Mon, 23 Dec 2024 20:43:58 -0500
Subject: [PATCH 3/7] fix(cohere,langchain): handle exceptions in
 compressDocuments and formatDocuments methods due to empty documents (#7372)

---
 langchain/src/chains/combine_documents/base.ts | 3 +++
 libs/langchain-cohere/src/rerank.ts            | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/langchain/src/chains/combine_documents/base.ts b/langchain/src/chains/combine_documents/base.ts
index cf4cb70c833c..5ffa53df6c3d 100644
--- a/langchain/src/chains/combine_documents/base.ts
+++ b/langchain/src/chains/combine_documents/base.ts
@@ -21,6 +21,9 @@ export async function formatDocuments({
   documents: Document[];
   config?: RunnableConfig;
 }) {
+  if (documents == null || documents.length === 0) {
+    return "";
+  }
   const formattedDocs = await Promise.all(
     documents.map((document) =>
       documentPrompt
diff --git a/libs/langchain-cohere/src/rerank.ts b/libs/langchain-cohere/src/rerank.ts
index 78a13efa2f83..0dd5834e2527 100644
--- a/libs/langchain-cohere/src/rerank.ts
+++ b/libs/langchain-cohere/src/rerank.ts
@@ -60,6 +60,9 @@ export class CohereRerank extends BaseDocumentCompressor {
     documents: Array<DocumentInterface>,
     query: string
   ): Promise<Array<DocumentInterface>> {
+    if (documents == null || documents.length === 0) {
+      return [];
+    }
     const _docs = documents.map((doc) => doc.pageContent);
     const { results } = await this.client.rerank({
       model: this.model,

From a7dd5d2a68f5c9d7ef10fd1af672c18e33d0cd78 Mon Sep 17 00:00:00 2001
From: Jacob Lee <jacoblee93@gmail.com>
Date: Mon, 23 Dec 2024 18:01:00 -0800
Subject: [PATCH 4/7] release(cohere): 0.3.2 (#7424)

---
 libs/langchain-cohere/package.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libs/langchain-cohere/package.json b/libs/langchain-cohere/package.json
index 2f9eedec14c7..e2a3cbfe860e 100644
--- a/libs/langchain-cohere/package.json
+++ b/libs/langchain-cohere/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@langchain/cohere",
-  "version": "0.3.1",
+  "version": "0.3.2",
   "description": "Cohere integration for LangChain.js",
   "type": "module",
   "engines": {

From be3fc04304416f83487b44c376cfce4c7daa9597 Mon Sep 17 00:00:00 2001
From: boni-teppanyaki <94654557+boni-teppanyaki@users.noreply.github.com>
Date: Mon, 23 Dec 2024 21:54:08 -0500
Subject: [PATCH 5/7] feat(community): Port ArxivRetriever to LangChainJS
 (#7250)

Co-authored-by: Antonio Ferreras <antfs10@gmail.com>
Co-authored-by: Dhruvin Patel <pateldhruvin2503@gmail.com>
Co-authored-by: Yiran Gogo Yu <gogoyiranyu@gmail.com>
Co-authored-by: Jacob Lee <jacoblee93@gmail.com>
---
 .../retrievers/arxiv-retriever.mdx            |  99 ++++++
 examples/src/retrievers/arxiv.ts              |  65 ++++
 libs/langchain-community/.gitignore           |   4 +
 libs/langchain-community/langchain.config.js  |   2 +
 libs/langchain-community/package.json         |  18 +
 .../src/load/import_constants.ts              |   1 +
 .../src/retrievers/arxiv.ts                   |  50 +++
 .../src/retrievers/tests/arxiv.int.test.ts    | 318 ++++++++++++++++++
 libs/langchain-community/src/utils/arxiv.ts   | 242 +++++++++++++
 yarn.lock                                     |  15 +
 10 files changed, 814 insertions(+)
 create mode 100644 docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx
 create mode 100644 examples/src/retrievers/arxiv.ts
 create mode 100644 libs/langchain-community/src/retrievers/arxiv.ts
 create mode 100644 libs/langchain-community/src/retrievers/tests/arxiv.int.test.ts
 create mode 100644 libs/langchain-community/src/utils/arxiv.ts

diff --git a/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx b/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx
new file mode 100644
index 000000000000..254c90ca49fe
--- /dev/null
+++ b/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx
@@ -0,0 +1,99 @@
+# ArxivRetriever
+
+The `ArxivRetriever` allows users to query the arXiv database for academic articles. It supports both full-document retrieval (PDF parsing) and summary-based retrieval.
+
+For detailed documentation of all ArxivRetriever features and configurations, head to the [API reference](https://api.js.langchain.com/classes/_langchain_community.retrievers_arxiv.ArxivRetriever.html)
+
+## Features
+
+- Query Flexibility: Search using natural language queries or specific arXiv IDs.
+- Full-Document Retrieval: Option to fetch and parse PDFs.
+- Summaries as Documents: Retrieve summaries for faster results.
+- Customizable Options: Configure maximum results and output format.
+
+## Integration details
+
+| Retriever        | Source                       | Package                                                                      |
+| ---------------- | ---------------------------- | ---------------------------------------------------------------------------- |
+| `ArxivRetriever` | Academic articles from arXiv | [`@langchain/community`](https://www.npmjs.com/package/@langchain/community) |
+
+## Setup
+
+Ensure the following dependencies are installed:
+
+- `pdf-parse` for parsing PDFs
+- `fast-xml-parser` for parsing XML responses from the arXiv API
+
+```npm2yarn
+npm install pdf-parse fast-xml-parser
+```
+
+## Instantiation
+
+```typescript
+const retriever = new ArxivRetriever({
+  getFullDocuments: false, // Set to true to fetch full documents (PDFs)
+  maxSearchResults: 5, // Maximum number of results to retrieve
+});
+```
+
+## Usage
+
+Use the `invoke` method to search arXiv for relevant articles. You can use either natural language queries or specific arXiv IDs.
+
+```typescript
+const query = "quantum computing";
+
+const documents = await retriever.invoke(query);
+documents.forEach((doc) => {
+  console.log("Title:", doc.metadata.title);
+  console.log("Content:", doc.pageContent); // Summary text, or parsed PDF content when getFullDocuments is true
+});
+```
+
+## Use within a chain
+
+Like other retrievers, `ArxivRetriever` can be incorporated into LLM applications via chains. Below is an example of using the retriever within a chain:
+
+```typescript
+import { ChatOpenAI } from "@langchain/openai";
+import { ChatPromptTemplate } from "@langchain/core/prompts";
+import {
+  RunnablePassthrough,
+  RunnableSequence,
+} from "@langchain/core/runnables";
+import { StringOutputParser } from "@langchain/core/output_parsers";
+import type { Document } from "@langchain/core/documents";
+
+const llm = new ChatOpenAI({
+  model: "gpt-4o-mini",
+  temperature: 0,
+});
+
+const prompt = ChatPromptTemplate.fromTemplate(`
+Answer the question based only on the context provided.
+
+Context: {context}
+
+Question: {question}`);
+
+const formatDocs = (docs: Document[]) => {
+  return docs.map((doc) => doc.pageContent).join("\n\n");
+};
+
+const ragChain = RunnableSequence.from([
+  {
+    context: retriever.pipe(formatDocs),
+    question: new RunnablePassthrough(),
+  },
+  prompt,
+  llm,
+  new StringOutputParser(),
+]);
+
+await ragChain.invoke("What are the latest advances in quantum computing?");
+```
+
+## API reference
+
+For detailed documentation of all ArxivRetriever features and configurations, head to the [API reference](https://api.js.langchain.com/classes/_langchain_community.retrievers_arxiv.ArxivRetriever.html)
diff --git a/examples/src/retrievers/arxiv.ts b/examples/src/retrievers/arxiv.ts
new file mode 100644
index 000000000000..3e74502e7d49
--- /dev/null
+++ b/examples/src/retrievers/arxiv.ts
@@ -0,0 +1,65 @@
+import { ArxivRetriever } from "@langchain/community/retrievers/arxiv";
+
+export const run = async () => {
+  /*
+    Direct look up by arXiv ID, for full texts
+  */
+
+  const queryId = "1605.08386 2103.03404";
+  const retrieverById = new ArxivRetriever({
+    getFullDocuments: true,
+    maxSearchResults: 5,
+  });
+  const documentsById = await retrieverById.invoke(queryId);
+  console.log(documentsById);
+
+  /*
+  [
+    Document
+    {
+      pageContent,
+      metadata: 
+      {
+        authors,
+        id,
+        published,
+        source,
+        updated,
+        url
+      }
+    },
+    Document
+    {
+      pageContent,
+      metadata
+    }
+  ]
+  */
+
+  /*
+  Search with natural language query, for summaries
+  */
+
+  const queryNat = "What is the ImageBind model?";
+  const retrieverByNat = new ArxivRetriever({
+    getFullDocuments: false,
+    maxSearchResults: 2,
+  });
+  const documentsByQuery = await retrieverByNat.invoke(queryNat);
+  console.log(documentsByQuery);
+
+  /*
+  [
+    Document
+    {
+      pageContent,
+      metadata
+    },
+    Document
+    {
+      pageContent,
+      metadata
+    }
+  ]
+  */
+};
diff --git a/libs/langchain-community/.gitignore b/libs/langchain-community/.gitignore
index 4fde6ded00ff..dcef7c9a15d9 100644
--- a/libs/langchain-community/.gitignore
+++ b/libs/langchain-community/.gitignore
@@ -626,6 +626,10 @@ retrievers/amazon_knowledge_base.cjs
 retrievers/amazon_knowledge_base.js
 retrievers/amazon_knowledge_base.d.ts
 retrievers/amazon_knowledge_base.d.cts
+retrievers/arxiv.cjs
+retrievers/arxiv.js
+retrievers/arxiv.d.ts
+retrievers/arxiv.d.cts
 retrievers/bm25.cjs
 retrievers/bm25.js
 retrievers/bm25.d.ts
diff --git a/libs/langchain-community/langchain.config.js b/libs/langchain-community/langchain.config.js
index 62abeef26886..f0c1914d5e78 100644
--- a/libs/langchain-community/langchain.config.js
+++ b/libs/langchain-community/langchain.config.js
@@ -198,6 +198,7 @@ export const config = {
     // retrievers
     "retrievers/amazon_kendra": "retrievers/amazon_kendra",
     "retrievers/amazon_knowledge_base": "retrievers/amazon_knowledge_base",
+    "retrievers/arxiv": "retrievers/arxiv",
     "retrievers/bm25": "retrievers/bm25",
     "retrievers/chaindesk": "retrievers/chaindesk",
     "retrievers/databerry": "retrievers/databerry",
@@ -437,6 +438,7 @@ export const config = {
     "chat_models/zhipuai",
     "retrievers/amazon_kendra",
     "retrievers/amazon_knowledge_base",
+    "retrievers/arxiv",
     "retrievers/dria",
     "retrievers/metal",
     "retrievers/supabase",
diff --git a/libs/langchain-community/package.json b/libs/langchain-community/package.json
index b759b36153af..1a45528ec5b6 100644
--- a/libs/langchain-community/package.json
+++ b/libs/langchain-community/package.json
@@ -166,6 +166,7 @@
     "eslint-plugin-no-instanceof": "^1.0.1",
     "eslint-plugin-prettier": "^4.2.1",
     "faiss-node": "^0.5.1",
+    "fast-xml-parser": "^4.5.1",
     "firebase-admin": "^11.9.0 || ^12.0.0",
     "google-auth-library": "^9.10.0",
     "googleapis": "^126.0.1",
@@ -302,6 +303,7 @@
     "duck-duck-scrape": "^2.2.5",
     "epub2": "^3.0.1",
     "faiss-node": "^0.5.1",
+    "fast-xml-parser": "*",
     "firebase-admin": "^11.9.0 || ^12.0.0",
     "google-auth-library": "*",
     "googleapis": "*",
@@ -584,6 +586,9 @@
     "faiss-node": {
       "optional": true
     },
+    "fast-xml-parser": {
+      "optional": true
+    },
     "firebase-admin": {
       "optional": true
     },
@@ -2125,6 +2130,15 @@
       "import": "./retrievers/amazon_knowledge_base.js",
       "require": "./retrievers/amazon_knowledge_base.cjs"
     },
+    "./retrievers/arxiv": {
+      "types": {
+        "import": "./retrievers/arxiv.d.ts",
+        "require": "./retrievers/arxiv.d.cts",
+        "default": "./retrievers/arxiv.d.ts"
+      },
+      "import": "./retrievers/arxiv.js",
+      "require": "./retrievers/arxiv.cjs"
+    },
     "./retrievers/bm25": {
       "types": {
         "import": "./retrievers/bm25.d.ts",
@@ -3774,6 +3788,10 @@
     "retrievers/amazon_knowledge_base.js",
     "retrievers/amazon_knowledge_base.d.ts",
     "retrievers/amazon_knowledge_base.d.cts",
+    "retrievers/arxiv.cjs",
+    "retrievers/arxiv.js",
+    "retrievers/arxiv.d.ts",
+    "retrievers/arxiv.d.cts",
     "retrievers/bm25.cjs",
     "retrievers/bm25.js",
     "retrievers/bm25.d.ts",
diff --git a/libs/langchain-community/src/load/import_constants.ts b/libs/langchain-community/src/load/import_constants.ts
index 5930f82690db..014d418e872d 100644
--- a/libs/langchain-community/src/load/import_constants.ts
+++ b/libs/langchain-community/src/load/import_constants.ts
@@ -100,6 +100,7 @@ export const optionalImportEntrypoints: string[] = [
   "langchain_community/callbacks/handlers/upstash_ratelimit",
   "langchain_community/retrievers/amazon_kendra",
   "langchain_community/retrievers/amazon_knowledge_base",
+  "langchain_community/retrievers/arxiv",
   "langchain_community/retrievers/dria",
   "langchain_community/retrievers/metal",
   "langchain_community/retrievers/supabase",
diff --git a/libs/langchain-community/src/retrievers/arxiv.ts b/libs/langchain-community/src/retrievers/arxiv.ts
new file mode 100644
index 000000000000..8009ce9f8320
--- /dev/null
+++ b/libs/langchain-community/src/retrievers/arxiv.ts
@@ -0,0 +1,50 @@
+import { BaseRetriever, BaseRetrieverInput } from "@langchain/core/retrievers";
+import { Document } from "@langchain/core/documents";
+import {
+  searchArxiv,
+  loadDocsFromResults,
+  getDocsFromSummaries,
+} from "../utils/arxiv.js";
+
+export type ArxivRetrieverOptions = {
+  getFullDocuments?: boolean;
+  maxSearchResults?: number;
+} & BaseRetrieverInput;
+
+/**
+ * A retriever that searches arXiv for relevant articles based on a query.
+ * It can retrieve either full documents (PDFs) or just summaries.
+ */
+export class ArxivRetriever extends BaseRetriever {
+  static lc_name() {
+    return "ArxivRetriever";
+  }
+
+  lc_namespace = ["langchain", "retrievers", "arxiv_retriever"];
+
+  getFullDocuments = false;
+
+  maxSearchResults = 10;
+
+  constructor(options: ArxivRetrieverOptions = {}) {
+    super(options);
+    this.getFullDocuments = options.getFullDocuments ?? this.getFullDocuments;
+    this.maxSearchResults = options.maxSearchResults ?? this.maxSearchResults;
+  }
+
+  async _getRelevantDocuments(query: string): Promise<Document[]> {
+    try {
+      const results = await searchArxiv(query, this.maxSearchResults);
+
+      if (this.getFullDocuments) {
+        // Fetch and parse PDFs to get full documents
+        return await loadDocsFromResults(results);
+      } else {
+        // Use summaries as documents
+        return getDocsFromSummaries(results);
+      }
+    } catch (error) {
+      throw new Error(`Error retrieving documents from arXiv.`);
+    }
+  }
+}
diff --git a/libs/langchain-community/src/retrievers/tests/arxiv.int.test.ts b/libs/langchain-community/src/retrievers/tests/arxiv.int.test.ts
new file mode 100644
index 000000000000..bb05f11504e5
--- /dev/null
+++ b/libs/langchain-community/src/retrievers/tests/arxiv.int.test.ts
@@ -0,0 +1,318 @@
+import { test, expect } from "@jest/globals";
+import { ArxivRetriever } from "../arxiv.js";
+
+test("ArxivRetriever fetching document summaries test", async () => {
+  // Sample integration test for ArxivRetriever using the "machine learning" query
+  const retriever = new ArxivRetriever({
+    getFullDocuments: false,
+    maxSearchResults: 5,
+  });
+  const query = "machine learning";
+  const results = await retriever._getRelevantDocuments(query);
+
+  expect(results).toBeDefined();
+  expect(results.length).toBeGreaterThan(0);
+  expect(results.length).toBeLessThanOrEqual(5);
+
+  for (let i = 0; i < results.length; i += 1) {
+    expect(results[i]).toHaveProperty("pageContent");
+    expect(results[i].pageContent).toBeDefined();
+
+    expect(results[i]).toHaveProperty("metadata");
+    expect(results[i].metadata).toBeInstanceOf(Object);
+    expect(results[i].metadata).toHaveProperty("authors");
+    expect(results[i].metadata.authors).toBeInstanceOf(Array);
+    expect(results[i].metadata).toHaveProperty("id");
+    expect(results[i].metadata.id).toContain("arxiv.org");
+    expect(results[i].metadata).toHaveProperty("published");
+    expect(results[i].metadata.published).toMatch(
+      /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/
+    );
+    expect(results[i].metadata).toHaveProperty("source");
+    expect(results[i].metadata.source).toBe("arxiv");
+    expect(results[i].metadata).toHaveProperty("title");
+    expect(results[i].metadata).toHaveProperty("updated");
+    expect(results[i].metadata.updated).toMatch(
+      /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/
+    );
+    expect(results[i].metadata).toHaveProperty("url");
+    expect(results[i].metadata.url).toContain("arxiv.org");
+  }
+});
+
+test("ArxivRetriever fetching document summaries with invalid query test", async () => {
+  // Sample test for ArxivRetriever using an invalid query
+  const retriever = new ArxivRetriever({
+    getFullDocuments: false,
+    maxSearchResults: 5,
+  });
+  const query = "fjalsdkjfw";
+  const results = await retriever._getRelevantDocuments(query);
+
+  expect(results).toBeDefined();
+  expect(results.length).toBe(0);
+});
+
+test("ArxivRetriever fetching document summaries with empty query test", async () => {
+  // Sample test for ArxivRetriever using an empty query
+  const retriever = new ArxivRetriever({
+    getFullDocuments: false,
+    maxSearchResults: 5,
+  });
+  const query = "";
+  const results = await retriever._getRelevantDocuments(query);
+
+  expect(results).toBeDefined();
+  expect(results.length).toBe(0);
+});
+
+test("ArxivRetriever fetching document summaries with invalid maxSearchResults test", async () => {
+  // Summary mode with a negative maxSearchResults. NOTE(review): a failed expect in the try is caught below, so this test cannot fail — TODO tighten
+  try {
+    const retriever = new ArxivRetriever({
+      getFullDocuments: false,
+      maxSearchResults: -1,
+    });
+    const query = "machine learning";
+    const results = await retriever._getRelevantDocuments(query);
+    expect(results).toBeUndefined();
+    expect(results.length).toBe(0);
+  } catch (error) {
+    expect(error).toBeDefined();
+    expect(error).toBeInstanceOf(Error);
+  }
+});
+
+test("ArxivRetriever fetching document summaries with zero maxSearchResults test", async () => {
+  // Summary mode with a zero maxSearchResults. NOTE(review): a failed expect in the try is caught below, so this test cannot fail — TODO tighten
+  try {
+    const retriever = new ArxivRetriever({
+      getFullDocuments: false,
+      maxSearchResults: 0,
+    });
+    const query = "machine learning";
+    const results = await retriever._getRelevantDocuments(query);
+    expect(results).toBeUndefined();
+    expect(results.length).toBe(0);
+  } catch (error) {
+    expect(error).toBeDefined();
+    expect(error).toBeInstanceOf(Error);
+  }
+});
+
+test("ArxivRetriever fetching full documents test", async () => {
+  // Sample test for fetching full documents with ArxivRetriever
+  const retriever = new ArxivRetriever({
+    getFullDocuments: true,
+    maxSearchResults: 5,
+  });
+  const query = "machine learning";
+  const results = await retriever._getRelevantDocuments(query);
+
+  expect(results).toBeDefined();
+  expect(results.length).toBeGreaterThan(0);
+  expect(results.length).toBeLessThanOrEqual(5);
+
+  for (let i = 0; i < results.length; i += 1) {
+    expect(results[i]).toHaveProperty("pageContent");
+    expect(results[i].pageContent).toBeDefined();
+
+    expect(results[i]).toHaveProperty("id");
+
+    expect(results[i]).toHaveProperty("metadata");
+    expect(results[i].metadata).toBeInstanceOf(Object);
+    expect(results[i].metadata).toHaveProperty("authors");
+    expect(results[i].metadata.authors).toBeInstanceOf(Array);
+    expect(results[i].metadata).toHaveProperty("id");
+    expect(results[i].metadata.id).toContain("arxiv.org");
+    expect(results[i].metadata).toHaveProperty("published");
+    expect(results[i].metadata.published).toMatch(
+      /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/
+    );
+    expect(results[i].metadata).toHaveProperty("source");
+    expect(results[i].metadata.source).toBe("arxiv");
+    expect(results[i].metadata).toHaveProperty("title");
+    expect(results[i].metadata).toHaveProperty("updated");
+    expect(results[i].metadata.updated).toMatch(
+      /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/
+    );
+    expect(results[i].metadata).toHaveProperty("url");
+    expect(results[i].metadata.url).toContain("arxiv.org");
+    expect(results[i].metadata).toHaveProperty("summary");
+  }
+});
+
+test("ArxivRetriever fetching full documents with invalid query test", async () => {
+  // Sample test for fetching full documents with ArxivRetriever using an invalid query
+  const retriever = new ArxivRetriever({
+    getFullDocuments: true,
+    maxSearchResults: 5,
+  });
+  const query = "fjalsdkjfw";
+  const results = await retriever._getRelevantDocuments(query);
+
+  expect(results).toBeDefined();
+  expect(results.length).toBe(0);
+});
+
+test("ArxivRetriever fetching full documents with empty query test", async () => {
+  // Sample test for fetching full documents with ArxivRetriever using an empty query
+  const retriever = new ArxivRetriever({
+    getFullDocuments: true,
+    maxSearchResults: 5,
+  });
+  const query = "";
+  const results = await retriever._getRelevantDocuments(query);
+
+  expect(results).toBeDefined();
+  expect(results.length).toBe(0);
+});
+
+test("ArxivRetriever fetching full documents with invalid maxSearchResults test", async () => {
+  // Sample test for fetching full documents with ArxivRetriever using an invalid maxSearchResults
+  try {
+    const retriever = new ArxivRetriever({
+      getFullDocuments: true,
+      maxSearchResults: -1,
+    });
+    const query = "machine learning";
+    const results = await retriever._getRelevantDocuments(query);
+    expect(results).toBeUndefined();
+    expect(results.length).toBe(0);
+  } catch (error) {
+    expect(error).toBeDefined();
+    expect(error).toBeInstanceOf(Error);
+  }
+});
+
+test("ArxivRetriever fetching full documents with zero maxSearchResults", async () => {
+  // Sample test for fetching full documents with ArxivRetriever using a zero maxSearchResults
+  try {
+    const retriever = new ArxivRetriever({
+      getFullDocuments: true,
+      maxSearchResults: 0,
+    });
+    const query = "machine learning";
+    const results = await retriever._getRelevantDocuments(query);
+    expect(results).toBeUndefined();
+    expect(results.length).toBe(0);
+  } catch (error) {
+    expect(error).toBeDefined();
+    expect(error).toBeInstanceOf(Error);
+  }
+});
+
+test("ArxivRetriever search articles by id test", async () => {
+  // Sample test for fetching articles by arXiv IDs
+  const fetchIds = "2103.03404 2103.03405";
+  const retriever = new ArxivRetriever({
+    getFullDocuments: false,
+    maxSearchResults: 5,
+  });
+  const results = await retriever.invoke(fetchIds);
+
+  expect(results).toBeDefined();
+  expect(results.length).toBe(2);
+
+  for (let i = 0; i < results.length; i += 1) {
+    expect(results[i]).toHaveProperty("pageContent");
+    expect(results[i].pageContent).toBeDefined();
+
+    expect(results[i]).toHaveProperty("metadata");
+    expect(results[i].metadata).toBeInstanceOf(Object);
+    expect(results[i].metadata).toHaveProperty("authors");
+    expect(results[i].metadata.authors).toBeInstanceOf(Array);
+    expect(results[i].metadata).toHaveProperty("id");
+    expect(results[i].metadata.id).toContain("arxiv.org");
+    expect(results[i].metadata).toHaveProperty("published");
+    expect(results[i].metadata.published).toMatch(
+      /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/
+    );
+    expect(results[i].metadata).toHaveProperty("source");
+    expect(results[i].metadata.source).toBe("arxiv");
+    expect(results[i].metadata).toHaveProperty("title");
+    expect(results[i].metadata).toHaveProperty("updated");
+    expect(results[i].metadata.updated).toMatch(
+      /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/
+    );
+    expect(results[i].metadata).toHaveProperty("url");
+    expect(results[i].metadata.url).toContain("arxiv.org");
+  }
+});
+
+test("ArxivRetriever search articles by id with invalid id test", async () => {
+  // Sample test for fetching articles by arXiv IDs with an invalid ID
+  const fetchIds = "2103.03404 2103.03405 1234.56789";
+  const retriever = new ArxivRetriever({
+    getFullDocuments: false,
+    maxSearchResults: 5,
+  });
+  const results = await retriever.invoke(fetchIds);
+
+  expect(results).toBeDefined();
+  expect(results.length).toBeLessThan(3);
+});
+
+test("ArxivRetriever search articles by id with empty id test", async () => {
+  // Sample test for fetching articles by arXiv IDs with an empty ID
+  const fetchIds = "";
+  const retriever = new ArxivRetriever({
+    getFullDocuments: false,
+    maxSearchResults: 5,
+  });
+  const results = await retriever.invoke(fetchIds);
+
+  expect(results).toBeDefined();
+  expect(results.length).toBe(0);
+});
+
+test("ArxivRetriever search articles by id with invalid maxSearchResults test", async () => {
+  // Sample test for fetching articles by arXiv IDs with an invalid maxSearchResults
+  try {
+    const fetchIds = "2103.03404 2103.03405";
+    const retriever = new ArxivRetriever({
+      getFullDocuments: false,
+      maxSearchResults: -1,
+    });
+    const results = await retriever.invoke(fetchIds);
+    expect(results).toBeUndefined();
+    expect(results.length).toBe(0);
+  } catch (error) {
+    expect(error).toBeDefined();
+    expect(error).toBeInstanceOf(Error);
+  }
+});
+
+test("ArxivRetriever search articles by id with invalid id and maxSearchResults test", async () => {
+  // Sample test for fetching articles by arXiv IDs with an invalid ID and maxSearchResults
+  try {
+    const fetchIds = "2103.03404 2103.03405 1234.56789";
+    const retriever = new ArxivRetriever({
+      getFullDocuments: false,
+      maxSearchResults: -1,
+    });
+    const results = await retriever.invoke(fetchIds);
+    expect(results).toBeUndefined();
+    expect(results.length).toBe(0);
+  } catch (error) {
+    expect(error).toBeDefined();
+    expect(error).toBeInstanceOf(Error);
+  }
+});
+
+test("ArxivRetriever search articles by id with invalid id and zero maxSearchResults test", async () => {
+  // Sample test for fetching articles by arXiv IDs with an invalid ID and zero maxSearchResults
+  try {
+    const fetchIds = "2103.03404 2103.03405 1234.56789";
+    const retriever = new ArxivRetriever({
+      getFullDocuments: false,
+      maxSearchResults: 0,
+    });
+    const results = await retriever.invoke(fetchIds);
+    expect(results).toBeUndefined();
+    expect(results.length).toBe(0);
+  } catch (error) {
+    expect(error).toBeDefined();
+    expect(error).toBeInstanceOf(Error);
+  }
+});
diff --git a/libs/langchain-community/src/utils/arxiv.ts b/libs/langchain-community/src/utils/arxiv.ts
new file mode 100644
index 000000000000..6a79b78a776a
--- /dev/null
+++ b/libs/langchain-community/src/utils/arxiv.ts
@@ -0,0 +1,242 @@
+/* eslint-disable import/no-extraneous-dependencies */
+import { Document } from "@langchain/core/documents";
+import { XMLParser } from "fast-xml-parser";
+
+import { PDFLoader } from "../document_loaders/fs/pdf.js";
+
+// Normalized arXiv feed entry, as returned by processEntry in this file
+interface ArxivEntry {
+  id: string; // entry.id from the parsed feed; also reused as the document URL
+  title: string; // whitespace runs collapsed to single spaces, then trimmed
+  summary: string; // whitespace runs collapsed to single spaces, then trimmed
+  published: string; // copied verbatim from the parsed entry
+  updated: string; // copied verbatim from the parsed entry
+  authors: string[]; // the `name` of each author element
+  pdfUrl: string; // href of the link titled "pdf", else derived from id
+  links: any[]; // parsed link element(s), always wrapped in an array
+}
+
+// True when the trimmed query is a single arXiv ID (e.g. "2103.03404v2"); false for free text
+export function isArXivIdentifier(query: string): boolean {
+  const candidate = query.trim();
+  return /^\d{4}\.\d{4,5}(v\d+)?$|^\d{7}(\.\d+)?(v\d+)?$/.test(candidate);
+}
+
+/**
+ * Fetches arXiv entries for one or more IDs via the API's `id_list` parameter.
+ *
+ * @param arxivIds One or more IDs separated by whitespace and/or commas.
+ * @returns The processed entries; empty when no ID is given or the feed has none.
+ * @throws Error when the request or its processing fails; the message
+ *   includes the underlying failure reason.
+ */
+export async function fetchDirectArxivArticle(
+  arxivIds: string
+): Promise<ArxivEntry[]> {
+  try {
+    // Encode each ID so characters such as "&" or "#" cannot alter the URL.
+    const idList = arxivIds
+      .split(/[\s,]+/)
+      .filter(Boolean)
+      .map((id) => encodeURIComponent(id))
+      .join(",");
+    if (idList === "") {
+      return [];
+    }
+    const url = `http://export.arxiv.org/api/query?id_list=${idList}`;
+    const response = await fetch(url);
+    if (!response.ok) {
+      throw new Error(`HTTP error! status: ${response.status}`);
+    }
+    const parser = new XMLParser({
+      ignoreAttributes: false,
+      attributeNamePrefix: "@_",
+    });
+    // A payload without a <feed> root is treated as "no entries".
+    const entries = parser.parse(await response.text())?.feed?.entry;
+    if (!entries) {
+      return [];
+    }
+    // A single result is parsed as an object rather than an array.
+    return (Array.isArray(entries) ? entries : [entries]).map(processEntry);
+  } catch (error) {
+    const reason = error instanceof Error ? error.message : String(error);
+    throw new Error(`Failed to fetch articles with IDs ${arxivIds}: ${reason}`);
+  }
+}
+
+/**
+ * Runs a free-text search (`search_query=all:<query>`) against the arXiv API.
+ * @param query Natural language query; URL-encoded before being sent.
+ * @param start Value for the API's `start` parameter (offset of the first result).
+ * @param maxResults Value for the API's `max_results` parameter.
+ * @returns The processed entries, or an empty array when the feed has none.
+ * @throws Error for negative/non-integer paging values or a failed request;
+ *   the message includes the underlying failure reason.
+ */
+export async function fetchArxivResultsByQuery(
+  query: string,
+  start = 0,
+  maxResults = 10
+): Promise<ArxivEntry[]> {
+  try {
+    // Reject bad paging values locally instead of relying on the remote API.
+    if (![start, maxResults].every((n) => Number.isInteger(n) && n >= 0)) {
+      throw new Error("start and maxResults must be non-negative integers");
+    }
+    const url = `http://export.arxiv.org/api/query?search_query=all:${encodeURIComponent(query)}&start=${start}&max_results=${maxResults}`;
+    const response = await fetch(url);
+    if (!response.ok) {
+      throw new Error(`HTTP error! status: ${response.status}`);
+    }
+    const parser = new XMLParser({
+      ignoreAttributes: false,
+      attributeNamePrefix: "@_",
+    });
+    // A payload without a <feed> root is treated as "no entries".
+    const entries = parser.parse(await response.text())?.feed?.entry;
+    if (!entries) {
+      return [];
+    }
+    // A single result is parsed as an object rather than an array.
+    return (Array.isArray(entries) ? entries : [entries]).map(processEntry);
+  } catch (error) {
+    const reason = error instanceof Error ? error.message : String(error);
+    throw new Error(`Failed to fetch articles with query "${query}": ${reason}`);
+  }
+}
+
+// Dispatches to an ID lookup or a free-text search (only the latter receives maxResults)
+export async function searchArxiv(
+  query: string,
+  maxResults = 3
+): Promise<ArxivEntry[]> {
+  const lookupById = isArXivIdentifier(query);
+  const entries = lookupById
+    ? await fetchDirectArxivArticle(query)
+    : await fetchArxivResultsByQuery(query, 0, maxResults);
+  return entries;
+}
+
+/**
+ * Downloads a PDF and extracts its text using PDFLoader.
+ *
+ * @param pdfUrl URL of the PDF to download.
+ * @returns The page content of all loaded documents, joined by blank lines.
+ * @throws Error when the download or parsing fails; the message includes the
+ *   underlying failure reason.
+ */
+export async function fetchAndParsePDF(pdfUrl: string): Promise<string> {
+  try {
+    const response = await fetch(pdfUrl);
+    if (!response.ok) {
+      throw new Error(`HTTP error! status: ${response.status}`);
+    }
+
+    // The loader is handed a Blob, so wrap the downloaded bytes in one.
+    const blob = new Blob([await response.arrayBuffer()], {
+      type: "application/pdf",
+    });
+    const docs = await new PDFLoader(blob, { splitPages: false }).load();
+    return docs.map((doc) => doc.pageContent).join("\n\n");
+  } catch (error) {
+    const reason = error instanceof Error ? error.message : String(error);
+    throw new Error(`Failed to fetch or parse PDF from ${pdfUrl}: ${reason}`);
+  }
+}
+
+/**
+ * Downloads each result's PDF in turn into a Document (entry fields as metadata).
+ * @throws Error on the first PDF that fails; the message includes the reason.
+ */
+export async function loadDocsFromResults(
+  results: ArxivEntry[]
+): Promise<Document[]> {
+  const docs: Document[] = [];
+  for (const result of results) {
+    const { pdfUrl } = result;
+    try {
+      const pageContent = await fetchAndParsePDF(pdfUrl);
+      const metadata = {
+        id: result.id,
+        title: result.title,
+        authors: result.authors,
+        published: result.published,
+        updated: result.updated,
+        source: "arxiv",
+        url: result.id,
+        summary: result.summary,
+      };
+      docs.push(new Document({ pageContent, metadata }));
+    } catch (error) {
+      const reason = error instanceof Error ? error.message : String(error);
+      throw new Error(`Error loading document from ${pdfUrl}: ${reason}`);
+    }
+  }
+  return docs;
+}
+
+/**
+ * Converts entries into Documents without fetching any PDF: the summary is the
+ * page content; id, title, authors and dates (plus a fixed source) are metadata.
+ */
+export function getDocsFromSummaries(results: ArxivEntry[]): Document[] {
+  return results.map(
+    (entry) =>
+      new Document({
+        pageContent: entry.summary,
+        metadata: {
+          id: entry.id,
+          title: entry.title,
+          authors: entry.authors,
+          published: entry.published,
+          updated: entry.updated,
+          source: "arxiv",
+          url: entry.id,
+        },
+      })
+  );
+}
+
+/**
+ * Normalizes one parsed feed entry into an ArxivEntry.
+ *
+ * Text fields are coerced with String() first, because the XML parser may hand
+ * back a number (e.g. for an all-digit title) or omit a tag entirely.
+ * `pdfUrl` is the href of the link titled "pdf", falling back to the entry id
+ * with "/abs/" replaced by "/pdf/" and ".pdf" appended.
+ *
+ * @param entry A single `feed.entry` object as produced by the XML parser.
+ * @returns The normalized entry.
+ */
+function processEntry(entry: any): ArxivEntry {
+  const text = (value: unknown): string => String(value ?? "");
+  const collapse = (value: unknown): string =>
+    text(value).replace(/\s+/g, " ").trim();
+  // The parser yields a bare object for one child and an array for several.
+  const toArray = (value: any): any[] => {
+    if (Array.isArray(value)) return value;
+    return value ? [value] : [];
+  };
+
+  const id = text(entry.id);
+  const links = toArray(entry.link);
+  const authors = toArray(entry.author).map((author: any) =>
+    text(author?.name)
+  );
+
+  // Prefer the explicit PDF link; otherwise derive the URL from the id.
+  const pdfLink = links.find((link: any) => link["@_title"] === "pdf");
+  const pdfUrl = pdfLink?.["@_href"] || `${id.replace("/abs/", "/pdf/")}.pdf`;
+
+  return {
+    id,
+    title: collapse(entry.title),
+    summary: collapse(entry.summary),
+    published: entry.published,
+    updated: entry.updated,
+    authors,
+    pdfUrl,
+    links,
+  };
+}
diff --git a/yarn.lock b/yarn.lock
index 9e5a48455320..abae3190907d 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -11908,6 +11908,7 @@ __metadata:
     eslint-plugin-prettier: ^4.2.1
     expr-eval: ^2.0.2
     faiss-node: ^0.5.1
+    fast-xml-parser: ^4.5.1
     firebase-admin: ^11.9.0 || ^12.0.0
     flat: ^5.0.2
     google-auth-library: ^9.10.0
@@ -12050,6 +12051,7 @@ __metadata:
     duck-duck-scrape: ^2.2.5
     epub2: ^3.0.1
     faiss-node: ^0.5.1
+    fast-xml-parser: "*"
     firebase-admin: ^11.9.0 || ^12.0.0
     google-auth-library: "*"
     googleapis: "*"
@@ -12252,6 +12254,8 @@ __metadata:
       optional: true
     faiss-node:
       optional: true
+    fast-xml-parser:
+      optional: true
     firebase-admin:
       optional: true
     google-auth-library:
@@ -28227,6 +28231,17 @@ __metadata:
   languageName: node
   linkType: hard
 
+"fast-xml-parser@npm:^4.5.1":
+  version: 4.5.1
+  resolution: "fast-xml-parser@npm:4.5.1"
+  dependencies:
+    strnum: ^1.0.5
+  bin:
+    fxparser: src/cli/cli.js
+  checksum: aab32d7f08a95b20f9ecdc2d769531a9dc454faf12740873972f8169c04ab9335ac5df1029ebfe829a01ddbb0ec60572cb7769d6be2409e95a9be8fc6a86e92c
+  languageName: node
+  linkType: hard
+
 "fastq@npm:^1.6.0":
   version: 1.15.0
   resolution: "fastq@npm:1.15.0"

From 94525f9e09a7ce3e40912938dc6065bfe370631d Mon Sep 17 00:00:00 2001
From: Jacob Lee <jacoblee93@gmail.com>
Date: Mon, 23 Dec 2024 18:58:48 -0800
Subject: [PATCH 6/7] release(community): 0.3.20 (#7425)

---
 libs/langchain-community/package.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libs/langchain-community/package.json b/libs/langchain-community/package.json
index 1a45528ec5b6..bceb60def832 100644
--- a/libs/langchain-community/package.json
+++ b/libs/langchain-community/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@langchain/community",
-  "version": "0.3.19",
+  "version": "0.3.20",
   "description": "Third-party integrations for LangChain.js",
   "type": "module",
   "engines": {

From 45498632ce2f5d539d84d049bf5b6717f674ac46 Mon Sep 17 00:00:00 2001
From: Jacob Lee <jacoblee93@gmail.com>
Date: Tue, 24 Dec 2024 10:54:04 -0800
Subject: [PATCH 7/7] Release 0.3.8 (#7427)

---
 langchain/package.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/langchain/package.json b/langchain/package.json
index 34ebd7e4a980..6329b37aac9f 100644
--- a/langchain/package.json
+++ b/langchain/package.json
@@ -1,6 +1,6 @@
 {
   "name": "langchain",
-  "version": "0.3.7",
+  "version": "0.3.8",
   "description": "Typescript bindings for langchain",
   "type": "module",
   "engines": {