From 8f140d790feeb2078603d30b581253c80835aa50 Mon Sep 17 00:00:00 2001
From: Antonio Ferreras <antfs10@gmail.com>
Date: Tue, 12 Nov 2024 00:30:28 -0500
Subject: [PATCH 01/16] create ArxivRetriever, arxiv utils file, and config
 updates

---
 libs/langchain-community/.gitignore           |   4 +
 libs/langchain-community/langchain.config.js  |   1 +
 libs/langchain-community/package.json         |  13 ++
 .../src/load/import_map.ts                    |   1 +
 .../src/retrievers/arxiv.ts                   |  45 ++++
 libs/langchain-community/src/utils/arxiv.ts   | 201 ++++++++++++++++++
 6 files changed, 265 insertions(+)
 create mode 100644 libs/langchain-community/src/retrievers/arxiv.ts
 create mode 100644 libs/langchain-community/src/utils/arxiv.ts

diff --git a/libs/langchain-community/.gitignore b/libs/langchain-community/.gitignore
index 890c93717dea..7e32a755f495 100644
--- a/libs/langchain-community/.gitignore
+++ b/libs/langchain-community/.gitignore
@@ -610,6 +610,10 @@ retrievers/amazon_knowledge_base.cjs
 retrievers/amazon_knowledge_base.js
 retrievers/amazon_knowledge_base.d.ts
 retrievers/amazon_knowledge_base.d.cts
+retrievers/arxiv.cjs
+retrievers/arxiv.js
+retrievers/arxiv.d.ts
+retrievers/arxiv.d.cts
 retrievers/bm25.cjs
 retrievers/bm25.js
 retrievers/bm25.d.ts
diff --git a/libs/langchain-community/langchain.config.js b/libs/langchain-community/langchain.config.js
index 63b495f92f2c..bb88b074ef48 100644
--- a/libs/langchain-community/langchain.config.js
+++ b/libs/langchain-community/langchain.config.js
@@ -193,6 +193,7 @@ export const config = {
     // retrievers
     "retrievers/amazon_kendra": "retrievers/amazon_kendra",
     "retrievers/amazon_knowledge_base": "retrievers/amazon_knowledge_base",
+    "retrievers/arxiv": "retrievers/arxiv",
     "retrievers/bm25": "retrievers/bm25",
     "retrievers/chaindesk": "retrievers/chaindesk",
     "retrievers/databerry": "retrievers/databerry",
diff --git a/libs/langchain-community/package.json b/libs/langchain-community/package.json
index a1f60050f981..e8746e15b833 100644
--- a/libs/langchain-community/package.json
+++ b/libs/langchain-community/package.json
@@ -2085,6 +2085,15 @@
       "import": "./retrievers/amazon_knowledge_base.js",
       "require": "./retrievers/amazon_knowledge_base.cjs"
     },
+    "./retrievers/arxiv": {
+      "types": {
+        "import": "./retrievers/arxiv.d.ts",
+        "require": "./retrievers/arxiv.d.cts",
+        "default": "./retrievers/arxiv.d.ts"
+      },
+      "import": "./retrievers/arxiv.js",
+      "require": "./retrievers/arxiv.cjs"
+    },
     "./retrievers/bm25": {
       "types": {
         "import": "./retrievers/bm25.d.ts",
@@ -3673,6 +3682,10 @@
     "retrievers/amazon_knowledge_base.js",
     "retrievers/amazon_knowledge_base.d.ts",
     "retrievers/amazon_knowledge_base.d.cts",
+    "retrievers/arxiv.cjs",
+    "retrievers/arxiv.js",
+    "retrievers/arxiv.d.ts",
+    "retrievers/arxiv.d.cts",
     "retrievers/bm25.cjs",
     "retrievers/bm25.js",
     "retrievers/bm25.d.ts",
diff --git a/libs/langchain-community/src/load/import_map.ts b/libs/langchain-community/src/load/import_map.ts
index 5bbd9e4d0a01..76870173d7da 100644
--- a/libs/langchain-community/src/load/import_map.ts
+++ b/libs/langchain-community/src/load/import_map.ts
@@ -54,6 +54,7 @@ export * as chat_models__moonshot from "../chat_models/moonshot.js";
 export * as chat_models__ollama from "../chat_models/ollama.js";
 export * as chat_models__togetherai from "../chat_models/togetherai.js";
 export * as chat_models__yandex from "../chat_models/yandex.js";
+export * as retrievers__arxiv from "../retrievers/arxiv.js";
 export * as retrievers__bm25 from "../retrievers/bm25.js";
 export * as retrievers__chaindesk from "../retrievers/chaindesk.js";
 export * as retrievers__databerry from "../retrievers/databerry.js";
diff --git a/libs/langchain-community/src/retrievers/arxiv.ts b/libs/langchain-community/src/retrievers/arxiv.ts
new file mode 100644
index 000000000000..b8bb5a524eea
--- /dev/null
+++ b/libs/langchain-community/src/retrievers/arxiv.ts
@@ -0,0 +1,45 @@
+import { BaseRetriever, BaseRetrieverInput } from "@langchain/core/retrievers";
+import { Document } from "@langchain/core/documents";
+import { searchArxiv, loadDocsFromResults, getDocsFromSummaries } from '../utils/arxiv.js';
+
+export type ArxivRetrieverOptions = BaseRetrieverInput & {
+    getFullDocuments?: boolean; // ArxivRetriever falls back to false when omitted
+    maxSearchResults?: number; // ArxivRetriever falls back to 10 when omitted
+};
+
+/**
+ * Retriever that searches arXiv and returns the matches as LangChain documents.
+ * Page content is the entry summary by default, or the text loaded from each entry's PDF when `getFullDocuments` is true.
+ */
+export class ArxivRetriever extends BaseRetriever {
+    static lc_name() {
+        return "ArxivRetriever";
+    }
+
+    lc_namespace = ["langchain", "retrievers", "arxiv_retriever"];
+
+    getFullDocuments: boolean; // false (default): summaries only; true: load each PDF
+    maxSearchResults: number; // passed to searchArxiv as maxResults (default 10)
+
+    constructor(options: ArxivRetrieverOptions = {}) {
+        super(options);
+        this.getFullDocuments = options.getFullDocuments ?? false;
+        this.maxSearchResults = options.maxSearchResults ?? 10;
+    }
+
+    /**
+     * Searches arXiv for `query` and converts the results to documents.
+     * @throws Error carrying the underlying failure message when the search or PDF loading fails.
+     */
+    async _getRelevantDocuments(query: string): Promise<Document[]> {
+        try {
+            const results = await searchArxiv(query, this.maxSearchResults);
+            // Full mode loads every PDF; summary mode builds documents without further requests.
+            return this.getFullDocuments ? await loadDocsFromResults(results) : getDocsFromSummaries(results);
+        } catch (error) {
+            // Keep the root cause in the message instead of discarding it.
+            const reason = error instanceof Error ? error.message : String(error);
+            throw new Error(`Error retrieving documents from arXiv: ${reason}`);
+        }
+    }
+}
diff --git a/libs/langchain-community/src/utils/arxiv.ts b/libs/langchain-community/src/utils/arxiv.ts
new file mode 100644
index 000000000000..e62e228876e2
--- /dev/null
+++ b/libs/langchain-community/src/utils/arxiv.ts
@@ -0,0 +1,201 @@
+import axios from 'axios'; // For HTTP requests
+import pdfParse from 'pdf-parse'; // For parsing PDFs
+import { XMLParser } from 'fast-xml-parser'; // For parsing XML
+import { Document } from "@langchain/core/documents";
+
+// Interface for processed arXiv entry (the shape returned by processEntry)
+interface ArxivEntry {
+    id: string; // copied from entry.id
+    title: string; // whitespace runs collapsed to one space, then trimmed
+    summary: string; // whitespace runs collapsed to one space, then trimmed
+    published: string;
+    updated: string;
+    authors: string[]; // the `name` of each author element
+    pdfUrl: string; // href of the link titled "pdf", else derived from id (/abs/ -> /pdf/ plus ".pdf")
+    links: any[]; // every link element of the entry
+}
+
+// True when the trimmed query is one arXiv ID: new style (2101.00001, optional vN) or old style with or without its archive prefix (hep-th/9901001, math.GT/0309136, 9901001).
+export function isArXivIdentifier(query: string): boolean {
+    const arxivIdRegex = /^\d{4}\.\d{4,5}(v\d+)?$|^([a-z]+(-[a-z]+)?(\.[A-Z]{2})?\/)?\d{7}(\.\d+)?(v\d+)?$/;
+    return arxivIdRegex.test(query.trim());
+}
+
+/**
+ * Fetches arXiv entries by identifier through the arXiv export API (`id_list` parameter).
+ * @param arxivIds One or more arXiv IDs separated by commas and/or whitespace.
+ * @returns The processed entries, or an empty array when the feed has no entry.
+ * @throws Error when the request, XML parsing or entry processing fails; the cause is appended to the message.
+ */
+export async function fetchDirectArxivArticle(arxivIds: string): Promise<ArxivEntry[]> {
+    try {
+        // Percent-encode each ID but keep the literal commas that separate id_list values.
+        const idList = arxivIds.split(/[\s,]+/).filter(Boolean).map((id) => encodeURIComponent(id)).join(',');
+        const url = `https://export.arxiv.org/api/query?id_list=${idList}`;
+        const response = await axios.get(url);
+
+        const parser = new XMLParser({
+            ignoreAttributes: false,
+            attributeNamePrefix: "@_",
+        });
+        const entries = parser.parse(response.data).feed.entry;
+        if (!entries) {
+            return [];
+        }
+
+        // A lone <entry> is not wrapped in an array by the parser, so normalise before mapping.
+        const entryList = Array.isArray(entries) ? entries : [entries];
+        return entryList.map(processEntry);
+    } catch (error) {
+        // Surface the root cause (HTTP status, parse error, ...) rather than dropping it.
+        const reason = error instanceof Error ? error.message : String(error);
+        throw new Error(`Failed to fetch articles with IDs ${arxivIds}: ${reason}`);
+    }
+}
+
+/**
+ * Runs a keyword search against the arXiv export API (`search_query=all:<query>`).
+ * @param query Free-text query; it is URL-encoded before being sent.
+ * @param start Offset of the first result, sent as the API's `start` parameter (default 0).
+ * @param maxResults Maximum number of entries requested (default 10).
+ * @returns The processed entries, or an empty array when the feed has no entry.
+ * @throws Error when the request, XML parsing or entry processing fails; the cause is appended to the message.
+ */
+export async function fetchArxivResultsByQuery(query: string, start = 0, maxResults = 10): Promise<ArxivEntry[]> {
+    try {
+        const encodedQuery = encodeURIComponent(query);
+        const url = `https://export.arxiv.org/api/query?search_query=all:${encodedQuery}&start=${start}&max_results=${maxResults}`;
+        const response = await axios.get(url);
+
+        const parser = new XMLParser({
+            ignoreAttributes: false,
+            attributeNamePrefix: "@_",
+        });
+        const entries = parser.parse(response.data).feed.entry;
+        if (!entries) {
+            return [];
+        }
+
+        // A lone <entry> is not wrapped in an array by the parser, so normalise before mapping.
+        const entryList = Array.isArray(entries) ? entries : [entries];
+        return entryList.map(processEntry);
+    } catch (error) {
+        const reason = error instanceof Error ? error.message : String(error);
+        throw new Error(`Failed to fetch articles with query "${query}": ${reason}`);
+    }
+}
+
+/**
+ * Dispatches a query: a string recognised by isArXivIdentifier is fetched by ID,
+ * anything else is run as a search starting at offset 0 and limited to `maxResults`.
+ */
+export async function searchArxiv(query: string, maxResults = 3): Promise<ArxivEntry[]> {
+    const byId = isArXivIdentifier(query);
+    return byId ? fetchDirectArxivArticle(query) : fetchArxivResultsByQuery(query, 0, maxResults);
+}
+
+// Used to fetch and parse PDF to text
+export async function fetchAndParsePDF(pdfUrl: string): Promise<string> {
+    try {
+        const response = await axios.get(pdfUrl, { responseType: 'arraybuffer' });
+        const buffer = Buffer.from(response.data);
+        const data = await pdfParse(buffer);
+        return data.text;
+    } catch (error) {
+        throw new Error(`Failed to fetch or parse PDF from ${pdfUrl}`);
+    }
+}
+
+// Loads the PDF of each search result, one at a time, into a Document whose metadata carries the entry fields and summary.
+// Any failure aborts the whole load with an Error naming the PDF URL and the underlying cause.
+export async function loadDocsFromResults(results: ArxivEntry[]): Promise<Document[]> {
+    const docs: Document[] = [];
+    for (const result of results) {
+        const { pdfUrl } = result;
+        try {
+            const pageContent = await fetchAndParsePDF(pdfUrl);
+            docs.push(new Document({
+                pageContent,
+                metadata: {
+                    id: result.id,
+                    title: result.title,
+                    authors: result.authors,
+                    published: result.published,
+                    updated: result.updated,
+                    source: 'arxiv',
+                    url: result.id,
+                    summary: result.summary,
+                },
+            }));
+        } catch (error) {
+            const reason = error instanceof Error ? error.message : String(error);
+            throw new Error(`Error loading document from ${pdfUrl}: ${reason}`);
+        }
+    }
+    return docs;
+}
+
+/**
+ * Builds one Document per search result without any download: the summary becomes
+ * the page content and id, title, authors and dates are copied into the metadata.
+ */
+export function getDocsFromSummaries(results: ArxivEntry[]): Document[] {
+    return results.map((entry) => {
+        const { id, title, authors, published, updated, summary } = entry;
+        return new Document({
+            pageContent: summary,
+            metadata: {
+                id,
+                title,
+                authors,
+                published,
+                updated,
+                source: 'arxiv',
+                url: id,
+            },
+        });
+    });
+}
+
+/**
+ * Converts one parsed Atom <entry> object into an ArxivEntry.
+ *
+ * Title and summary go through String() before whitespace is collapsed: with its default
+ * tag-value parsing fast-xml-parser can hand back a number for an all-digit text node, and
+ * a missing tag reads as undefined - either case previously threw on `.replace`.
+ * `author` and `link` may be a single object or an array, so both are normalised to
+ * arrays (empty when absent).
+ * The PDF URL is the href of the link titled "pdf"; without one it is derived from the
+ * abstract URL in `id` (/abs/ -> /pdf/, plus ".pdf").
+ */
+function processEntry(entry: any): ArxivEntry {
+    const collapse = (text: unknown): string => String(text ?? '').replace(/\s+/g, ' ').trim();
+    const toArray = (value: any): any[] => (Array.isArray(value) ? value : value ? [value] : []);
+
+    const id = entry.id;
+    const title = collapse(entry.title);
+    const summary = collapse(entry.summary);
+    const published = entry.published;
+    const updated = entry.updated;
+
+    const authors: string[] = toArray(entry.author).map((author: any) => author.name);
+    const links = toArray(entry.link);
+
+    // Extract PDF link
+    const pdfLinkObj = links.find((link: any) => link["@_title"] === 'pdf');
+    let pdfUrl = id.replace('/abs/', '/pdf/') + '.pdf';
+    if (pdfLinkObj && pdfLinkObj["@_href"]) {
+        pdfUrl = pdfLinkObj["@_href"];
+    }
+
+    return {
+        id,
+        title,
+        summary,
+        published,
+        updated,
+        authors,
+        pdfUrl,
+        links,
+    };
+}

From b4d4a699c645f2aaf4cc8e8360ba8f6f43151160 Mon Sep 17 00:00:00 2001
From: Dhruvin Patel <pateldhruvin2503@gmail.com>
Date: Mon, 18 Nov 2024 13:24:50 -0500
Subject: [PATCH 02/16] Documentation for Arxiv-Retriever

---
 .../retrievers/arxiv-retriever.mdx            | 148 ++++++++++++++++++
 1 file changed, 148 insertions(+)
 create mode 100644 docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx

diff --git a/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx b/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx
new file mode 100644
index 000000000000..23a67e7a98df
--- /dev/null
+++ b/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx
@@ -0,0 +1,148 @@
+# Documentation for ArxivRetriever in LangChain.js
+---
+
+## Overview
+
+The `arXiv Retriever` allows users to query the arXiv database for academic articles. It supports both full-document retrieval (PDF parsing) and summary-based retrieval.
+
+---
+
+## Installation
+
+Ensure the following dependencies are installed:
+- `axios` for making HTTP requests
+- `pdf-parse` for parsing PDFs
+- `fast-xml-parser` for parsing XML responses from the arXiv API
+
+```bash
+npm install axios pdf-parse fast-xml-parser
+```
+---
+## Features
+- Query Flexibility: Search using natural language queries or specific arXiv IDs.
+- Full-Document Retrieval: Option to fetch and parse PDFs.
+- Summaries as Documents: Retrieve summaries for faster results.
+- Customizable Options: Configure maximum results and output format.
+
+---
+
+## Getting started
+
+#### Import the path
+```bash
+import { ArxivRetriever } from "langchain-community/retrievers/arxiv.js";
+```
+---
+
+## Class: ArxivRetriever
+
+### Parameters
+
+| Name              | Type      | Default | Description                                          |
+|-------------------|-----------|---------|------------------------------------------------------|
+| `getFullDocuments` | `boolean` | `false` | Whether to fetch full documents (PDFs) instead of summaries. |
+| `maxSearchResults` | `number`  | `10`    | Maximum number of results to fetch from arXiv.      |
+
+
+
+### Methods
+
+### `_getRelevantDocuments(query: string): Promise<Document[]>`
+
+Fetches documents from arXiv based on the input query.
+
+#### Parameters
+
+| Name   | Type     | Description                            |
+|--------|----------|----------------------------------------|
+| `query` | `string` | A natural language query or arXiv ID. |
+
+#### Returns
+A `Promise` that resolves to an array of LangChain `Document` instances.
+
+#### Example
+```typescript
+const documents = await retriever._getRelevantDocuments("machine learning in climate science");
+console.log(documents);
+```
+---
+
+## Utility Functions
+
+## `isArXivIdentifier(query: string): boolean`
+
+Checks if a query is a valid arXiv ID.
+
+### Parameters
+
+| Name   | Type     | Description                       |
+|--------|----------|-----------------------------------|
+| `query` | `string` | Query to check for arXiv ID format. |
+
+### Returns
+
+`true` if the query is a valid arXiv ID; otherwise, `false`.
+
+
+## `fetchDirectArxivArticle(arxivIds: string): Promise<ArxivEntry[]>`
+
+Fetches arXiv articles using specific arXiv IDs.
+
+### Parameters
+
+| Name       | Type     | Description                           |
+|------------|----------|---------------------------------------|
+| `arxivIds` | `string` | Comma-separated list of arXiv IDs.    |
+
+### Returns
+
+A `Promise` that resolves to an array of `ArxivEntry` objects.
+
+
+## `fetchArxivResultsByQuery(query: string, maxResults: number): Promise<ArxivEntry[]>`
+
+Fetches results from arXiv using a natural language query.
+
+### Parameters
+
+| Name         | Type     | Default | Description                          |
+|--------------|----------|---------|--------------------------------------|
+| `query`      | `string` |         | Search query.                        |
+| `maxResults` | `number` | `10`    | Maximum number of results to fetch.  |
+
+### Returns
+
+A `Promise` that resolves to an array of `ArxivEntry` objects.
+
+
+## `fetchAndParsePDF(pdfUrl: string): Promise<string>`
+
+Fetches a PDF document and parses its content into text.
+
+### Parameters
+
+| Name     | Type     | Description                 |
+|----------|----------|-----------------------------|
+| `pdfUrl` | `string` | URL of the PDF to retrieve. |
+
+### Returns
+
+A `Promise` that resolves to the parsed text of the PDF.
+
+---
+
+## Example
+```bash
+import { ArxivRetriever } from "langchain-community/retrievers/arxiv.js";
+
+const retriever = new ArxivRetriever({
+    getFullDocuments: false,
+    maxSearchResults: 3,
+});
+
+const documents = await retriever._getRelevantDocuments("neural networks in optimization");
+documents.forEach(doc => {
+    console.log("Title:", doc.metadata.title);
+    console.log("Summary:", doc.pageContent);
+});
+```

From 5b8958f4d2c749ca6217a92b24b2f48c736f2f89 Mon Sep 17 00:00:00 2001
From: Dhruvin Patel <pateldhruvin2503@gmail.com>
Date: Tue, 19 Nov 2024 18:57:41 -0500
Subject: [PATCH 03/16] Edit the documentation for arXIV

---
 .../retrievers/arxiv-retriever.mdx            | 116 ++++--------------
 1 file changed, 23 insertions(+), 93 deletions(-)

diff --git a/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx b/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx
index 23a67e7a98df..5009109e3cea 100644
--- a/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx
+++ b/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx
@@ -1,4 +1,4 @@
-# Documentation for ArxivRetriever in LangChain.js
+# ArxivRetriever in LangChain.js (Docs)
 ---
 
 ## Overview
@@ -7,6 +7,13 @@ The `arXiv Retriever` allows users to query the arXiv database for academic arti
 
 ---
 
+## Features
+- Query Flexibility: Search using natural language queries or specific arXiv IDs.
+- Full-Document Retrieval: Option to fetch and parse PDFs.
+- Summaries as Documents: Retrieve summaries for faster results.
+- Customizable Options: Configure maximum results and output format.
+
+---
 ## Installation
 
 Ensure the following dependencies are installed:
@@ -17,21 +24,22 @@ Ensure the following dependencies are installed:
 ```bash
 npm install axios pdf-parse fast-xml-parser
 ```
----
-## Features
-- Query Flexibility: Search using natural language queries or specific arXiv IDs.
-- Full-Document Retrieval: Option to fetch and parse PDFs.
-- Summaries as Documents: Retrieve summaries for faster results.
-- Customizable Options: Configure maximum results and output format.
-
 ---
 
 ## Getting started
 
 #### Import the path
-```bash
+```typescript
-import { ArxivRetriever } from "langchain-community/retrievers/arxiv.js";
+import { ArxivRetriever } from "@langchain/community/retrievers/arxiv";
 ```
+
+#### Instantiate the retriever
+```typescript
+const retriever = new ArxivRetriever({
+  getFullDocuments: false, // Set to true to fetch full documents (PDFs)
+  maxSearchResults: 5,     // Maximum number of results to retrieve
+});
+```
 ---
 
 ## Class: ArxivRetriever
@@ -47,9 +55,9 @@ import { ArxivRetriever } from "langchain-community/retrievers/arxiv.js";
 
 ### Methods
 
-### `_getRelevantDocuments(query: string): Promise<Document[]>`
+### `invoke(query: string): Promise<Document[]>`
 
-Fetches documents from arXiv based on the input query.
+Use the invoke method to search arXiv for relevant articles. You can use either natural language queries or specific arXiv IDs.
 
 #### Parameters
 
@@ -62,87 +70,9 @@ A `Promise` that resolves to an array of LangChain `Document` instances.
 
 #### Example
 ```typescript
-const documents = await retriever._getRelevantDocuments("machine learning in climate science");
-console.log(documents);
-```
----
-
-## Utility Functions
-
-## `isArXivIdentifier(query: string): boolean`
-
-Checks if a query is a valid arXiv ID.
-
-### Parameters
-
-| Name   | Type     | Description                       |
-|--------|----------|-----------------------------------|
-| `query` | `string` | Query to check for arXiv ID format. |
-
-### Returns
-
-`true` if the query is a valid arXiv ID; otherwise, `false`.
-
-
-## `fetchDirectArxivArticle(arxivIds: string): Promise<ArxivEntry[]>`
-
-Fetches arXiv articles using specific arXiv IDs.
-
-### Parameters
-
-| Name       | Type     | Description                           |
-|------------|----------|---------------------------------------|
-| `arxivIds` | `string` | Comma-separated list of arXiv IDs.    |
-
-### Returns
-
-A `Promise` that resolves to an array of `ArxivEntry` objects.
-
-
-## `fetchArxivResultsByQuery(query: string, maxResults: number): Promise<ArxivEntry[]>`
-
-Fetches results from arXiv using a natural language query.
-
-### Parameters
-
-| Name         | Type     | Default | Description                          |
-|--------------|----------|---------|--------------------------------------|
-| `query`      | `string` |         | Search query.                        |
-| `maxResults` | `number` | `10`    | Maximum number of results to fetch.  |
-
-### Returns
-
-A `Promise` that resolves to an array of `ArxivEntry` objects.
-
-
-## `fetchAndParsePDF(pdfUrl: string): Promise<string>`
-
-Fetches a PDF document and parses its content into text.
-
-### Parameters
-
-| Name     | Type     | Description                 |
-|----------|----------|-----------------------------|
-| `pdfUrl` | `string` | URL of the PDF to retrieve. |
-
-### Returns
-
-A `Promise` that resolves to the parsed text of the PDF.
-
----
-
-## Example
-```bash
-import { ArxivRetriever } from "langchain-community/retrievers/arxiv.js";
-
-const retriever = new ArxivRetriever({
-    getFullDocuments: false,
-    maxSearchResults: 3,
-});
-
-const documents = await retriever._getRelevantDocuments("neural networks in optimization");
+const documents = await retriever.invoke("quantum computing");
 documents.forEach(doc => {
-    console.log("Title:", doc.metadata.title);
-    console.log("Summary:", doc.pageContent);
+  console.log("Title:", doc.metadata.title);
+  console.log("Content:", doc.pageContent); // Summary text, or parsed PDF content when getFullDocuments is true
 });
-```
+```
\ No newline at end of file

From 47dcac0a4b80a1d8fe96c323cd1a719e46d02ac7 Mon Sep 17 00:00:00 2001
From: Yiran Gogo Yu <gogoyiranyu@gmail.com>
Date: Tue, 19 Nov 2024 23:17:23 -0500
Subject: [PATCH 04/16] Create integration test for Arxiv-Retriever

---
 .../src/retrievers/tests/arxiv.int.test.ts    | 42 +++++++++++++++++++
 libs/langchain-community/src/utils/arxiv.ts   | 20 ++++++---
 2 files changed, 57 insertions(+), 5 deletions(-)
 create mode 100644 libs/langchain-community/src/retrievers/tests/arxiv.int.test.ts

diff --git a/libs/langchain-community/src/retrievers/tests/arxiv.int.test.ts b/libs/langchain-community/src/retrievers/tests/arxiv.int.test.ts
new file mode 100644
index 000000000000..071bce9d91aa
--- /dev/null
+++ b/libs/langchain-community/src/retrievers/tests/arxiv.int.test.ts
@@ -0,0 +1,42 @@
+import { test, expect } from "@jest/globals";
+import { ArxivRetriever } from "../arxiv.js";
+
+test("ArxivRetriever integration test", async () => {
+    // Sample integration test for ArxivRetriever using the "machine learning" query
+    const retriever = new ArxivRetriever(
+        {
+            getFullDocuments: false, // summary mode: pageContent is the entry summary
+            maxSearchResults: 5 // upper bound checked by toBeLessThanOrEqual(5) below
+        }
+    );
+    const query = "machine learning";
+    const results = await retriever._getRelevantDocuments(query); // NOTE(review): calls the underscore-prefixed hook directly, not retriever.invoke()
+
+    expect(results).toBeDefined();
+    expect(results.length).toBeGreaterThan(0);
+    expect(results.length).toBeLessThanOrEqual(5);
+
+    for (let i = 0; i < results.length; i += 1) {
+        expect(results[i]).toHaveProperty("pageContent");
+        expect(results[i].pageContent).toBeDefined();
+
+        expect(results[i]).toHaveProperty("id");
+        expect(results[i].id).toBeUndefined();
+
+        expect(results[i]).toHaveProperty("metadata");
+        expect(results[i].metadata).toBeInstanceOf(Object);
+        expect(results[i].metadata).toHaveProperty("authors");
+        expect(results[i].metadata.authors).toBeInstanceOf(Array);
+        expect(results[i].metadata).toHaveProperty("id");
+        expect(results[i].metadata.id).toContain("arxiv.org");
+        expect(results[i].metadata).toHaveProperty("published");
+        expect(results[i].metadata.published).toMatch(/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/); // YYYY-MM-DDTHH:MM:SSZ, whole seconds only
+        expect(results[i].metadata).toHaveProperty("source");
+        expect(results[i].metadata.source).toBe("arxiv");
+        expect(results[i].metadata).toHaveProperty("title");
+        expect(results[i].metadata).toHaveProperty("updated");
+        expect(results[i].metadata.updated).toMatch(/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/);
+        expect(results[i].metadata).toHaveProperty("url");
+        expect(results[i].metadata.url).toContain("arxiv.org");
+    }
+});
\ No newline at end of file
diff --git a/libs/langchain-community/src/utils/arxiv.ts b/libs/langchain-community/src/utils/arxiv.ts
index e62e228876e2..74439e45c69a 100644
--- a/libs/langchain-community/src/utils/arxiv.ts
+++ b/libs/langchain-community/src/utils/arxiv.ts
@@ -1,5 +1,5 @@
 import axios from 'axios'; // For HTTP requests
-import pdfParse from 'pdf-parse'; // For parsing PDFs
+import { PDFLoader } from "../document_loaders/fs/pdf.js";
 import { XMLParser } from 'fast-xml-parser'; // For parsing XML
 import { Document } from "@langchain/core/documents";
 
@@ -97,10 +97,20 @@ export async function searchArxiv(query: string, maxResults = 3): Promise<ArxivE
 // Used to fetch and parse PDF to text
 export async function fetchAndParsePDF(pdfUrl: string): Promise<string> {
     try {
-        const response = await axios.get(pdfUrl, { responseType: 'arraybuffer' });
+        // Fetch the PDF as an array buffer
+        const response = await axios.get(pdfUrl, { responseType: "arraybuffer" });
         const buffer = Buffer.from(response.data);
-        const data = await pdfParse(buffer);
-        return data.text;
+
+        // Convert the Buffer to a Blob
+        const blob = new Blob([buffer], { type: "application/pdf" });
+
+        // Use PDFLoader to process the PDF
+        const loader = new PDFLoader(blob, { splitPages: false }); // Pass the Blob
+        const docs: Document[] = await loader.load();
+
+        // Combine all document content into a single string
+        const content = docs.map((doc) => doc.pageContent).join("\n\n");
+        return content;
     } catch (error) {
         throw new Error(`Failed to fetch or parse PDF from ${pdfUrl}`);
     }
@@ -198,4 +208,4 @@ function processEntry(entry: any): ArxivEntry {
         pdfUrl,
         links,
     };
-}
+}
\ No newline at end of file

From f00deda9b14bf5085572eceb527fba565c2393d2 Mon Sep 17 00:00:00 2001
From: Yiran Gogo Yu <gogoyiranyu@gmail.com>
Date: Wed, 20 Nov 2024 20:45:55 -0500
Subject: [PATCH 05/16] Update integration test for arxiv retriever

---
 .../src/retrievers/tests/arxiv.int.test.ts    | 300 +++++++++++++++++-
 1 file changed, 298 insertions(+), 2 deletions(-)

diff --git a/libs/langchain-community/src/retrievers/tests/arxiv.int.test.ts b/libs/langchain-community/src/retrievers/tests/arxiv.int.test.ts
index 071bce9d91aa..11eb1040ed18 100644
--- a/libs/langchain-community/src/retrievers/tests/arxiv.int.test.ts
+++ b/libs/langchain-community/src/retrievers/tests/arxiv.int.test.ts
@@ -1,7 +1,7 @@
 import { test, expect } from "@jest/globals";
 import { ArxivRetriever } from "../arxiv.js";
 
-test("ArxivRetriever integration test", async () => {
+test("ArxivRetriever fetching document summaries test", async () => {
     // Sample integration test for ArxivRetriever using the "machine learning" query
     const retriever = new ArxivRetriever(
         {
@@ -16,12 +16,116 @@ test("ArxivRetriever integration test", async () => {
     expect(results.length).toBeGreaterThan(0);
     expect(results.length).toBeLessThanOrEqual(5);
 
+    for (let i = 0; i < results.length; i += 1) {
+        expect(results[i]).toHaveProperty("pageContent");
+        expect(results[i].pageContent).toBeDefined();
+
+        expect(results[i]).toHaveProperty("metadata");
+        expect(results[i].metadata).toBeInstanceOf(Object);
+        expect(results[i].metadata).toHaveProperty("authors");
+        expect(results[i].metadata.authors).toBeInstanceOf(Array);
+        expect(results[i].metadata).toHaveProperty("id");
+        expect(results[i].metadata.id).toContain("arxiv.org");
+        expect(results[i].metadata).toHaveProperty("published");
+        expect(results[i].metadata.published).toMatch(/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/);
+        expect(results[i].metadata).toHaveProperty("source");
+        expect(results[i].metadata.source).toBe("arxiv");
+        expect(results[i].metadata).toHaveProperty("title");
+        expect(results[i].metadata).toHaveProperty("updated");
+        expect(results[i].metadata.updated).toMatch(/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/);
+        expect(results[i].metadata).toHaveProperty("url");
+        expect(results[i].metadata.url).toContain("arxiv.org");
+    }
+});
+
+test("ArxivRetriever fetching document summaries with invalid query test", async () => {
+    // A nonsense search term is expected to match nothing in summary mode
+    const nonsenseQuery = "fjalsdkjfw";
+    const summaryRetriever = new ArxivRetriever({
+        getFullDocuments: false,
+        maxSearchResults: 5,
+    });
+
+    const docs = await summaryRetriever._getRelevantDocuments(nonsenseQuery);
+
+    // The result must be defined and contain no documents
+    expect(docs).toBeDefined();
+    expect(docs.length).toBe(0);
+});
+
+test("ArxivRetriever fetching document summaries with empty query test", async () => {
+    // An empty search string is expected to match nothing in summary mode
+    const emptyQuery = "";
+    const summaryRetriever = new ArxivRetriever({
+        getFullDocuments: false,
+        maxSearchResults: 5,
+    });
+
+    const docs = await summaryRetriever._getRelevantDocuments(emptyQuery);
+
+    // The result must be defined and contain no documents
+    expect(docs).toBeDefined();
+    expect(docs.length).toBe(0);
+});
+
+test("ArxivRetriever fetching document summaries with invalid maxSearchResults test", async () => {
+    // A negative maxSearchResults is expected to make the lookup reject with an Error.
+    // The assertions used to sit inside try/catch: a failed expect() also throws an Error,
+    // so the catch block accepted it and the test could never fail. `rejects` fails the
+    // test when the promise resolves instead.
+    // getFullDocuments is false so the test exercises summary mode, as its name says.
+    // NOTE(review): assumes arXiv answers max_results=-1 with an HTTP error - confirm.
+    //
+    const retriever = new ArxivRetriever(
+        {
+            getFullDocuments: false,
+            maxSearchResults: -1
+        }
+    );
+    const query = "machine learning";
+
+    await expect(retriever._getRelevantDocuments(query)).rejects.toBeInstanceOf(Error);
+});
+
+test("ArxivRetriever fetching document summaries with zero maxSearchResults test", async () => {
+    // Requesting zero results is expected to resolve to an empty list.
+    // The assertions used to sit inside try/catch with a contradictory pair
+    // (toBeUndefined, then .length); every failure was swallowed by the catch block.
+    // getFullDocuments is false so the test exercises summary mode, as its name says.
+    // NOTE(review): assumes arXiv returns a feed without entries for max_results=0 - confirm.
+    const retriever = new ArxivRetriever(
+        {
+            getFullDocuments: false,
+            maxSearchResults: 0
+        }
+    );
+    const query = "machine learning";
+    const results = await retriever._getRelevantDocuments(query);
+
+    expect(results).toBeDefined();
+    expect(results.length).toBe(0);
+});
+
+test("ArxivRetriever fetching full documents test", async () => {
+    // Sample test for fetching full documents with ArxivRetriever
+    const retriever = new ArxivRetriever(
+        {
+            getFullDocuments: true,
+            maxSearchResults: 5
+        }
+    );
+    const query = "machine learning";
+    const results = await retriever._getRelevantDocuments(query);
+
+    expect(results).toBeDefined();
+    expect(results.length).toBeGreaterThan(0);
+    expect(results.length).toBeLessThanOrEqual(5);
+
     for (let i = 0; i < results.length; i += 1) {
         expect(results[i]).toHaveProperty("pageContent");
         expect(results[i].pageContent).toBeDefined();
 
         expect(results[i]).toHaveProperty("id");
-        expect(results[i].id).toBeUndefined();
 
         expect(results[i]).toHaveProperty("metadata");
         expect(results[i].metadata).toBeInstanceOf(Object);
@@ -38,5 +142,197 @@ test("ArxivRetriever integration test", async () => {
         expect(results[i].metadata.updated).toMatch(/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/);
         expect(results[i].metadata).toHaveProperty("url");
         expect(results[i].metadata.url).toContain("arxiv.org");
+        expect(results[i].metadata).toHaveProperty("summary");
+    }
+});
+
+test("ArxivRetriever fetching full documents with invalid query test", async () => {
+    // Sample test for fetching full documents with ArxivRetriever using an invalid query
+    const retriever = new ArxivRetriever(
+        {
+            getFullDocuments: true,
+            maxSearchResults: 5
+        }
+    );
+    const query = "fjalsdkjfw";
+    const results = await retriever._getRelevantDocuments(query);
+
+    expect(results).toBeDefined();
+    expect(results.length).toBe(0);
+});
+
+test("ArxivRetriever fetching full documents with empty query test", async () => {
+    // Sample test for fetching full documents with ArxivRetriever using an empty query
+    const retriever = new ArxivRetriever(
+        {
+            getFullDocuments: true,
+            maxSearchResults: 5
+        }
+    );
+    const query = "";
+    const results = await retriever._getRelevantDocuments(query);
+
+    expect(results).toBeDefined();
+    expect(results.length).toBe(0);
+});
+
+test("ArxivRetriever fetching full documents with invalid maxSearchResults test", async () => {
+    // Sample test for fetching full documents with ArxivRetriever using an invalid maxSearchResults
+    try {
+        const retriever = new ArxivRetriever(
+            {
+                getFullDocuments: true,
+                maxSearchResults: -1
+            }
+        );
+        const query = "machine learning";
+        const results = await retriever._getRelevantDocuments(query);
+        expect(results).toBeUndefined();
+        expect(results.length).toBe(0);
+    } catch (error) {
+        expect(error).toBeDefined();
+        expect(error).toBeInstanceOf(Error);
+    }
+});
+
+test("ArxivRetriever fetching full documents with zero maxSearchResults", async () => {
+    // Sample test for fetching full documents with ArxivRetriever using an zero maxSearchResults
+    try {
+        const retriever = new ArxivRetriever(
+            {
+                getFullDocuments: true,
+                maxSearchResults: 0
+            }
+        );
+        const query = "machine learning";
+        const results = await retriever._getRelevantDocuments(query);
+        expect(results).toBeUndefined();
+        expect(results.length).toBe(0);
+    } catch (error) {
+        expect(error).toBeDefined();
+        expect(error).toBeInstanceOf(Error);
+    }
+});
+
+test("ArxivRetriever search articles by id test", async () => {
+    // Sample test for fetching articles by arXiv IDs
+    const fetchIds = "2103.03404 2103.03405";
+    const retriever = new ArxivRetriever(
+        {
+            getFullDocuments: false,
+            maxSearchResults: 5
+        }
+    );
+    const results = await retriever.invoke(fetchIds);
+
+    expect(results).toBeDefined();
+    expect(results.length).toBe(2);
+
+    for (let i = 0; i < results.length; i += 1) {
+        expect(results[i]).toHaveProperty("pageContent");
+        expect(results[i].pageContent).toBeDefined();
+
+        expect(results[i]).toHaveProperty("metadata");
+        expect(results[i].metadata).toBeInstanceOf(Object);
+        expect(results[i].metadata).toHaveProperty("authors");
+        expect(results[i].metadata.authors).toBeInstanceOf(Array);
+        expect(results[i].metadata).toHaveProperty("id");
+        expect(results[i].metadata.id).toContain("arxiv.org");
+        expect(results[i].metadata).toHaveProperty("published");
+        expect(results[i].metadata.published).toMatch(/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/);
+        expect(results[i].metadata).toHaveProperty("source");
+        expect(results[i].metadata.source).toBe("arxiv");
+        expect(results[i].metadata).toHaveProperty("title");
+        expect(results[i].metadata).toHaveProperty("updated");
+        expect(results[i].metadata.updated).toMatch(/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/);
+        expect(results[i].metadata).toHaveProperty("url");
+        expect(results[i].metadata.url).toContain("arxiv.org");
+    }
+});
+
+test("ArxivRetriever search articles by id with invalid id test", async () => {
+    // Sample test for fetching articles by arXiv IDs with an invalid ID
+    const fetchIds = "2103.03404 2103.03405 1234.56789";
+    const retriever = new ArxivRetriever(
+        {
+            getFullDocuments: false,
+            maxSearchResults: 5
+        }
+    );
+    const results = await retriever.invoke(fetchIds);
+
+    expect(results).toBeDefined();
+    expect(results.length).toBeLessThan(3);
+});
+
+test("ArxivRetriever search articles by id with empty id test", async () => {
+    // Sample test for fetching articles by arXiv IDs with an empty ID
+    const fetchIds = "";
+    const retriever = new ArxivRetriever(
+        {
+            getFullDocuments: false,
+            maxSearchResults: 5
+        }
+    );
+    const results = await retriever.invoke(fetchIds);
+
+    expect(results).toBeDefined();
+    expect(results.length).toBe(0);
+});
+
+test("ArxivRetriever search articles by id with invalid maxSearchResults test", async () => {
+    // Sample test for fetching articles by arXiv IDs with an invalid maxSearchResults
+    try {
+        const fetchIds = "2103.03404 2103.03405";
+        const retriever = new ArxivRetriever(
+            {
+                getFullDocuments: false,
+                maxSearchResults: -1
+            }
+        );
+        const results = await retriever.invoke(fetchIds);
+        expect(results).toBeUndefined();
+        expect(results.length).toBe(0);
+    } catch (error) {
+        expect(error).toBeDefined();
+        expect(error).toBeInstanceOf(Error);
+    }
+});
+
+test("ArxivRetriever search articles by id with invalid id and maxSearchResults test", async () => {
+    // Sample test for fetching articles by arXiv IDs with an invalid ID and maxSearchResults
+    try {
+        const fetchIds = "2103.03404 2103.03405 1234.56789";
+        const retriever = new ArxivRetriever(
+            {
+                getFullDocuments: false,
+                maxSearchResults: -1
+            }
+        );
+        const results = await retriever.invoke(fetchIds);
+        expect(results).toBeUndefined();
+        expect(results.length).toBe(0);
+    } catch (error) {
+        expect(error).toBeDefined();
+        expect(error).toBeInstanceOf(Error);
+    }
+});
+
+test("ArxivRetriever search articles by id with invalid id and zero maxSearchResults test", async () => {
+    // Sample test for fetching articles by arXiv IDs with an invalid ID and zero maxSearchResults
+    try {
+        const fetchIds = "2103.03404 2103.03405 1234.56789";
+        const retriever = new ArxivRetriever(
+            {
+                getFullDocuments: false,
+                maxSearchResults: 0
+            }
+        );
+        const results = await retriever.invoke(fetchIds);
+        expect(results).toBeUndefined();
+        expect(results.length).toBe(0);
+    } catch (error) {
+        expect(error).toBeDefined();
+        expect(error).toBeInstanceOf(Error);
     }
 });
\ No newline at end of file

From e52a6e12ff4efb253e6e8e7b3d4da3efaace8be9 Mon Sep 17 00:00:00 2001
From: boni-teppanyaki <hepintao81620@163.com>
Date: Fri, 22 Nov 2024 19:45:50 -0500
Subject: [PATCH 06/16] Add example usage file for arxiv retriever

---
 examples/src/retrievers/arxiv.ts | 67 ++++++++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)
 create mode 100644 examples/src/retrievers/arxiv.ts

diff --git a/examples/src/retrievers/arxiv.ts b/examples/src/retrievers/arxiv.ts
new file mode 100644
index 000000000000..3c164844474e
--- /dev/null
+++ b/examples/src/retrievers/arxiv.ts
@@ -0,0 +1,67 @@
+import { ArxivRetriever } from "../../../libs/langchain-community/src/retrievers/arxiv.js";
+
+export const run = async () => {
+  /*
+    Direct look up by arXiv ID, for full texts
+  */
+
+  const queryId = "1605.08386 2103.03404";
+  const retrieverById = new ArxivRetriever({
+    getFullDocuments: true,
+    maxSearchResults: 5
+  });
+  const documentsById = await retrieverById.invoke(queryId);
+  console.log(documentsById);
+
+  /*
+  [
+    Document
+    {
+      pageContent,
+      metadata: 
+      {
+        author,
+        id,
+        published,
+        source,
+        updated,
+        url
+      }
+    },
+    Document
+    {
+      pageContent,
+      metadata
+    }
+  ]
+  */
+
+  /*
+  Search with natural language query, for summaries
+  */
+
+  const queryNat = "What is the ImageBind model?";
+  const retrieverByNat = new ArxivRetriever(
+    {
+      getFullDocuments: false,
+      maxSearchResults: 2
+    }
+  );
+  const documentsByQuery = await retrieverByNat.invoke(queryNat);
+  console.log(documentsByQuery);
+
+  /*
+  [
+    Document
+    {
+      pageContent,
+      metadata
+    },
+    Document
+    {
+      pageContent,
+      metadata
+    }
+  ]
+  */
+};
\ No newline at end of file

From caa109c498c4b927ae160fee8a910aa9687352ad Mon Sep 17 00:00:00 2001
From: Yiran Gogo Yu <gogoyiranyu@gmail.com>
Date: Thu, 12 Dec 2024 16:54:24 -0500
Subject: [PATCH 07/16] Updated file to use fetch() instead of axios.get()

1. Removed the `import axios` line in src/utils/arxiv.ts and replaced each axios.get() call with fetch() plus a response.ok check
2. Removed axios from the dependency list in the "ArxivRetriever in LangChain.js (Docs)" file
---
 .../retrievers/arxiv-retriever.mdx            |  3 +-
 libs/langchain-community/src/utils/arxiv.ts   | 32 +++++++++++++------
 2 files changed, 24 insertions(+), 11 deletions(-)

diff --git a/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx b/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx
index 5009109e3cea..f82b5a76237e 100644
--- a/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx
+++ b/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx
@@ -17,12 +17,11 @@ The `arXiv Retriever` allows users to query the arXiv database for academic arti
 ## Installation
 
 Ensure the following dependencies are installed:
-- `axios` for making HTTP requests
 - `pdf-parse` for parsing PDFs
 - `fast-xml-parser` for parsing XML responses from the arXiv API
 
 ```bash
-npm install axios pdf-parse fast-xml-parser
+npm install pdf-parse fast-xml-parser
 ```
 ---
 
diff --git a/libs/langchain-community/src/utils/arxiv.ts b/libs/langchain-community/src/utils/arxiv.ts
index 74439e45c69a..c4ee12146bcf 100644
--- a/libs/langchain-community/src/utils/arxiv.ts
+++ b/libs/langchain-community/src/utils/arxiv.ts
@@ -1,4 +1,3 @@
-import axios from 'axios'; // For HTTP requests
 import { PDFLoader } from "../document_loaders/fs/pdf.js";
 import { XMLParser } from 'fast-xml-parser'; // For parsing XML
 import { Document } from "@langchain/core/documents";
@@ -26,8 +25,13 @@ export async function fetchDirectArxivArticle(arxivIds: string): Promise<ArxivEn
     try {
         const idList = arxivIds.split(/[\s,]+/).map(id => id.trim()).filter(Boolean).join(',');
         const url = `http://export.arxiv.org/api/query?id_list=${idList}`;
-        const response = await axios.get(url);
-        const xml = response.data;
+        const response = await fetch(url);
+        
+        if (!response.ok) {
+            throw new Error(`HTTP error! status: ${response.status}`);
+        }
+        
+        const xml = await response.text();
 
         const parser = new XMLParser({
             ignoreAttributes: false,
@@ -58,8 +62,13 @@ export async function fetchArxivResultsByQuery(query: string, start = 0, maxResu
     try {
         const encodedQuery = encodeURIComponent(query);
         const url = `http://export.arxiv.org/api/query?search_query=all:${encodedQuery}&start=${start}&max_results=${maxResults}`;
-        const response = await axios.get(url);
-        const xml = response.data;
+        const response = await fetch(url);
+        
+        if (!response.ok) {
+            throw new Error(`HTTP error! status: ${response.status}`);
+        }
+        
+        const xml = await response.text();
 
         const parser = new XMLParser({
             ignoreAttributes: false,
@@ -97,11 +106,16 @@ export async function searchArxiv(query: string, maxResults = 3): Promise<ArxivE
 // Used to fetch and parse PDF to text
 export async function fetchAndParsePDF(pdfUrl: string): Promise<string> {
     try {
-        // Fetch the PDF as an array buffer
-        const response = await axios.get(pdfUrl, { responseType: "arraybuffer" });
-        const buffer = Buffer.from(response.data);
+        // Fetch the PDF
+        const response = await fetch(pdfUrl);
+        
+        if (!response.ok) {
+            throw new Error(`HTTP error! status: ${response.status}`);
+        }
+        
+        const buffer = await response.arrayBuffer();
 
-        // Convert the Buffer to a Blob
+        // Convert the ArrayBuffer to a Blob
         const blob = new Blob([buffer], { type: "application/pdf" });
 
         // Use PDFLoader to process the PDF

From 55eb7396a96de688857e837188e5f54f14c2ee92 Mon Sep 17 00:00:00 2001
From: Dhruvin Patel <pateldhruvin2503@gmail.com>
Date: Thu, 12 Dec 2024 20:34:53 -0500
Subject: [PATCH 08/16] Final changes to docs

---
 .../retrievers/arxiv-retriever.mdx            | 91 ++++++++++++++++---
 1 file changed, 80 insertions(+), 11 deletions(-)

diff --git a/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx b/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx
index f82b5a76237e..1395fc10d459 100644
--- a/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx
+++ b/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx
@@ -3,7 +3,7 @@
 
 ## Overview
 
-The `arXiv Retriever` allows users to query the arXiv database for academic articles. It supports both full-document retrieval (PDF parsing) and summary-based retrieval.
+The `arXiv Retriever` allows users to query the arXiv database for academic articles. It supports both full-document retrieval (PDF parsing) and summary-based retrieval. For detailed documentation of all ArxivRetriever features and configurations, head to [API reference](#https://arxiv.org/)
 
 ---
 
@@ -14,7 +14,16 @@ The `arXiv Retriever` allows users to query the arXiv database for academic arti
 - Customizable Options: Configure maximum results and output format.
 
 ---
-## Installation
+
+## Integration details
+
+| Retriever        | Source                       | Package                                 |
+| ---------------- | ---------------------------- | --------------------------------------- |
+| `ArxivRetriever` | Academic articles from arXiv | `@langchain-community/retrievers/arxiv` |
+
+---
+
+## Setup/Installation
 
 Ensure the following dependencies are installed:
 - `pdf-parse` for parsing PDFs
@@ -25,20 +34,71 @@ npm install pdf-parse fast-xml-parser
 ```
 ---
 
-## Getting started
 
-#### Import the path
-```typescript
-import { ArxivRetriever } from "langchain-community/retrievers/arxiv.js";
-```
-
-#### Instantiate the retriever
+## Instantiate the retriever
 ```typescript
 const retriever = new ArxivRetriever({
   getFullDocuments: false, // Set to true to fetch full documents (PDFs)
   maxSearchResults: 5,     // Maximum number of results to retrieve
 });
 ```
+---
+## Usage
+
+Use the `invoke` method to search arXiv for relevant articles. You can use either natural language queries or specific arXiv IDs.
+
+```typescript
+const query = "quantum computing";
+
+const documents = await retriever.invoke(query);
+documents.forEach(doc => {
+  console.log("Title:", doc.metadata.title);
+  console.log("Content:", doc.pageContent); // Parsed PDF content
+});
+```
+
+---
+
+## Use within a chain
+
+Like other retrievers, `ArxivRetriever` can be incorporated into LLM applications via chains. Below is an example of using the retriever within a chain:
+
+```typescript
+import { ChatOpenAI } from "@langchain/openai";
+import { ChatPromptTemplate } from "@langchain/core/prompts";
+import { RunnablePassthrough, RunnableSequence } from "@langchain/core/runnables";
+import { StringOutputParser } from "@langchain/core/output_parsers";
+import type { Document } from "@langchain/core/documents";
+
+const llm = new ChatOpenAI({
+  model: "gpt-4o-mini",
+  temperature: 0,
+});
+
+const prompt = ChatPromptTemplate.fromTemplate(`
+Answer the question based only on the context provided.
+
+Context: {context}
+
+Question: {question}`);
+
+const formatDocs = (docs: Document[]) => {
+  return docs.map((doc) => doc.pageContent).join("\n\n");
+};
+
+const ragChain = RunnableSequence.from([
+  {
+    context: retriever.pipe(formatDocs),
+    question: new RunnablePassthrough(),
+  },
+  prompt,
+  llm,
+  new StringOutputParser(),
+]);
+
+await ragChain.invoke("What are the latest advances in quantum computing?");
+```
+
 ---
 
 ## Class: ArxivRetriever
@@ -51,7 +111,6 @@ const retriever = new ArxivRetriever({
 | `maxSearchResults` | `number`  | `10`    | Maximum number of results to fetch from arXiv.      |
 
 
-
 ### Methods
 
 ### `invoke(query: string): Promise<Document[]>`
@@ -74,4 +133,14 @@ documents.forEach(doc => {
   console.log("Title:", doc.metadata.title);
   console.log("Content:", doc.pageContent); // Parsed PDF content
 });
-```
\ No newline at end of file
+```
+
+For detailed documentation of all ArxivRetriever features and configurations, head to [API reference](#https://arxiv.org/)
+
+
+
+
+
+
+
+

From 3ae9fc9484ccc9726afdc6b1a6c3ee258f5fca26 Mon Sep 17 00:00:00 2001
From: Jacob Lee <jacoblee93@gmail.com>
Date: Mon, 23 Dec 2024 17:42:26 -0800
Subject: [PATCH 09/16] Update arxiv-retriever.mdx

---
 .../retrievers/arxiv-retriever.mdx            | 63 ++-----------------
 1 file changed, 6 insertions(+), 57 deletions(-)

diff --git a/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx b/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx
index 1395fc10d459..fdf4804b388a 100644
--- a/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx
+++ b/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx
@@ -1,27 +1,21 @@
-# ArxivRetriever in LangChain.js (Docs)
+# ArxivRetriever
 ---
 
 ## Overview
 
 The `arXiv Retriever` allows users to query the arXiv database for academic articles. It supports both full-document retrieval (PDF parsing) and summary-based retrieval. For detailed documentation of all ArxivRetriever features and configurations, head to [API reference](#https://arxiv.org/)
 
----
-
 ## Features
 - Query Flexibility: Search using natural language queries or specific arXiv IDs.
 - Full-Document Retrieval: Option to fetch and parse PDFs.
 - Summaries as Documents: Retrieve summaries for faster results.
 - Customizable Options: Configure maximum results and output format.
 
----
-
 ## Integration details
 
 | Retriever        | Source                       | Package                                 |
 | ---------------- | ---------------------------- | --------------------------------------- |
-| `ArxivRetriever` | Academic articles from arXiv | `@langchain-community/retrievers/arxiv` |
-
----
+| `ArxivRetriever` | Academic articles from arXiv | [`@langchain/community`](https://www.npmjs.com/package/@langchain/community) |
 
 ## Setup/Installation
 
@@ -29,20 +23,21 @@ Ensure the following dependencies are installed:
 - `pdf-parse` for parsing PDFs
 - `fast-xml-parser` for parsing XML responses from the arXiv API
 
-```bash
+```npm2yarn
 npm install pdf-parse fast-xml-parser
 ```
 ---
 
 
 ## Instantiate the retriever
+
 ```typescript
 const retriever = new ArxivRetriever({
   getFullDocuments: false, // Set to true to fetch full documents (PDFs)
   maxSearchResults: 5,     // Maximum number of results to retrieve
 });
 ```
----
+
 ## Usage
 
 Use the `invoke` method to search arXiv for relevant articles. You can use either natural language queries or specific arXiv IDs.
@@ -57,8 +52,6 @@ documents.forEach(doc => {
 });
 ```
 
----
-
 ## Use within a chain
 
 Like other retrievers, `ArxivRetriever` can be incorporated into LLM applications via chains. Below is an example of using the retriever within a chain:
@@ -99,48 +92,4 @@ const ragChain = RunnableSequence.from([
 await ragChain.invoke("What are the latest advances in quantum computing?");
 ```
 
----
-
-## Class: ArxivRetriever
-
-### Parameters
-
-| Name              | Type      | Default | Description                                          |
-|-------------------|-----------|---------|------------------------------------------------------|
-| `getFullDocuments` | `boolean` | `false` | Whether to fetch full documents (PDFs) instead of summaries. |
-| `maxSearchResults` | `number`  | `10`    | Maximum number of results to fetch from arXiv.      |
-
-
-### Methods
-
-### `invoke(query: string): Promise<Document[]>`
-
-Use the invoke method to search arXiv for relevant articles. You can use either natural language queries or specific arXiv IDs.
-
-#### Parameters
-
-| Name   | Type     | Description                            |
-|--------|----------|----------------------------------------|
-| `query` | `string` | A natural language query or arXiv ID. |
-
-#### Returns
-A `Promise` that resolves to an array of LangChain `Document` instances.
-
-#### Example
-```typescript
-const documents = await retriever.invoke("quantum computing");
-documents.forEach(doc => {
-  console.log("Title:", doc.metadata.title);
-  console.log("Content:", doc.pageContent); // Parsed PDF content
-});
-```
-
-For detailed documentation of all ArxivRetriever features and configurations, head to [API reference](#https://arxiv.org/)
-
-
-
-
-
-
-
-
+For detailed documentation of all ArxivRetriever features and configurations, head to the [API reference](https://api.js.langchain.com/classes/_langchain_community.retrievers_arxiv.ArxivRetriever.html)

From 58931bf0f49f2e59f5cd089b1cae967098654d26 Mon Sep 17 00:00:00 2001
From: jacoblee93 <jacoblee93@gmail.com>
Date: Mon, 23 Dec 2024 17:53:43 -0800
Subject: [PATCH 10/16] Format, rename, fix docs

---
 .../retrievers/arxiv-retriever.mdx            |  20 +-
 examples/src/retrievers/arxiv.ts              |  18 +-
 libs/langchain-community/langchain.config.js  |   1 +
 .../src/load/import_constants.ts              |   1 +
 .../src/load/import_map.ts                    |   1 -
 .../src/retrievers/arxiv.ts                   |  62 +-
 .../src/retrievers/tests/arxiv.int.test.ts    | 552 +++++++++---------
 libs/langchain-community/src/utils/arxiv.ts   | 394 +++++++------
 8 files changed, 527 insertions(+), 522 deletions(-)

diff --git a/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx b/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx
index fdf4804b388a..fff4da2a0a2d 100644
--- a/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx
+++ b/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx
@@ -1,4 +1,5 @@
 # ArxivRetriever
+
 ---
 
 ## Overview
@@ -6,6 +7,7 @@
 The `arXiv Retriever` allows users to query the arXiv database for academic articles. It supports both full-document retrieval (PDF parsing) and summary-based retrieval. For detailed documentation of all ArxivRetriever features and configurations, head to [API reference](#https://arxiv.org/)
 
 ## Features
+
 - Query Flexibility: Search using natural language queries or specific arXiv IDs.
 - Full-Document Retrieval: Option to fetch and parse PDFs.
 - Summaries as Documents: Retrieve summaries for faster results.
@@ -13,28 +15,27 @@ The `arXiv Retriever` allows users to query the arXiv database for academic arti
 
 ## Integration details
 
-| Retriever        | Source                       | Package                                 |
-| ---------------- | ---------------------------- | --------------------------------------- |
+| Retriever        | Source                       | Package                                                                      |
+| ---------------- | ---------------------------- | ---------------------------------------------------------------------------- |
 | `ArxivRetriever` | Academic articles from arXiv | [`@langchain/community`](https://www.npmjs.com/package/@langchain/community) |
 
 ## Setup/Installation
 
 Ensure the following dependencies are installed:
+
 - `pdf-parse` for parsing PDFs
 - `fast-xml-parser` for parsing XML responses from the arXiv API
 
 ```npm2yarn
 npm install pdf-parse fast-xml-parser
 ```
----
-
 
 ## Instantiate the retriever
 
 ```typescript
 const retriever = new ArxivRetriever({
-  getFullDocuments: false, // Set to true to fetch full documents (PDFs)
-  maxSearchResults: 5,     // Maximum number of results to retrieve
+  returnFullDocuments: false, // Set to true to fetch full documents (PDFs)
+  maxSearchResults: 5, // Maximum number of results to retrieve
 });
 ```
 
@@ -46,7 +47,7 @@ Use the `invoke` method to search arXiv for relevant articles. You can use eithe
 const query = "quantum computing";
 
 const documents = await retriever.invoke(query);
-documents.forEach(doc => {
+documents.forEach((doc) => {
   console.log("Title:", doc.metadata.title);
   console.log("Content:", doc.pageContent); // Parsed PDF content
 });
@@ -59,7 +60,10 @@ Like other retrievers, `ArxivRetriever` can be incorporated into LLM application
 ```typescript
 import { ChatOpenAI } from "@langchain/openai";
 import { ChatPromptTemplate } from "@langchain/core/prompts";
-import { RunnablePassthrough, RunnableSequence } from "@langchain/core/runnables";
+import {
+  RunnablePassthrough,
+  RunnableSequence,
+} from "@langchain/core/runnables";
 import { StringOutputParser } from "@langchain/core/output_parsers";
 import type { Document } from "@langchain/core/documents";
 
diff --git a/examples/src/retrievers/arxiv.ts b/examples/src/retrievers/arxiv.ts
index 3c164844474e..fe4a4346371c 100644
--- a/examples/src/retrievers/arxiv.ts
+++ b/examples/src/retrievers/arxiv.ts
@@ -1,4 +1,4 @@
-import { ArxivRetriever } from "../../../libs/langchain-community/src/retrievers/arxiv.js";
+import { ArxivRetriever } from "@langchain/community/retrievers/arxiv";
 
 export const run = async () => {
   /*
@@ -7,8 +7,8 @@ export const run = async () => {
 
   const queryId = "1605.08386 2103.03404";
   const retrieverById = new ArxivRetriever({
-    getFullDocuments: true,
-    maxSearchResults: 5
+    returnFullDocuments: true,
+    maxSearchResults: 5,
   });
   const documentsById = await retrieverById.invoke(queryId);
   console.log(documentsById);
@@ -41,12 +41,10 @@ export const run = async () => {
   */
 
   const queryNat = "What is the ImageBind model?";
-  const retrieverByNat = new ArxivRetriever(
-    {
-      getFullDocuments: false,
-      maxSearchResults: 2
-    }
-  );
+  const retrieverByNat = new ArxivRetriever({
+    returnFullDocuments: false,
+    maxSearchResults: 2,
+  });
   const documentsByQuery = await retrieverByNat.invoke(queryNat);
   console.log(documentsByQuery);
 
@@ -64,4 +62,4 @@ export const run = async () => {
     }
   ]
   */
-};
\ No newline at end of file
+};
diff --git a/libs/langchain-community/langchain.config.js b/libs/langchain-community/langchain.config.js
index 09d2baf3cab6..f0c1914d5e78 100644
--- a/libs/langchain-community/langchain.config.js
+++ b/libs/langchain-community/langchain.config.js
@@ -438,6 +438,7 @@ export const config = {
     "chat_models/zhipuai",
     "retrievers/amazon_kendra",
     "retrievers/amazon_knowledge_base",
+    "retrievers/arxiv",
     "retrievers/dria",
     "retrievers/metal",
     "retrievers/supabase",
diff --git a/libs/langchain-community/src/load/import_constants.ts b/libs/langchain-community/src/load/import_constants.ts
index 5930f82690db..014d418e872d 100644
--- a/libs/langchain-community/src/load/import_constants.ts
+++ b/libs/langchain-community/src/load/import_constants.ts
@@ -100,6 +100,7 @@ export const optionalImportEntrypoints: string[] = [
   "langchain_community/callbacks/handlers/upstash_ratelimit",
   "langchain_community/retrievers/amazon_kendra",
   "langchain_community/retrievers/amazon_knowledge_base",
+  "langchain_community/retrievers/arxiv",
   "langchain_community/retrievers/dria",
   "langchain_community/retrievers/metal",
   "langchain_community/retrievers/supabase",
diff --git a/libs/langchain-community/src/load/import_map.ts b/libs/langchain-community/src/load/import_map.ts
index 3a96bdf2cd8c..2ec7b20bc542 100644
--- a/libs/langchain-community/src/load/import_map.ts
+++ b/libs/langchain-community/src/load/import_map.ts
@@ -57,7 +57,6 @@ export * as chat_models__novita from "../chat_models/novita.js";
 export * as chat_models__ollama from "../chat_models/ollama.js";
 export * as chat_models__togetherai from "../chat_models/togetherai.js";
 export * as chat_models__yandex from "../chat_models/yandex.js";
-export * as retrievers__arxiv from "../retrievers/arxiv.js";
 export * as retrievers__bm25 from "../retrievers/bm25.js";
 export * as retrievers__chaindesk from "../retrievers/chaindesk.js";
 export * as retrievers__databerry from "../retrievers/databerry.js";
diff --git a/libs/langchain-community/src/retrievers/arxiv.ts b/libs/langchain-community/src/retrievers/arxiv.ts
index b8bb5a524eea..2eb0ab6c5675 100644
--- a/libs/langchain-community/src/retrievers/arxiv.ts
+++ b/libs/langchain-community/src/retrievers/arxiv.ts
@@ -1,10 +1,14 @@
 import { BaseRetriever, BaseRetrieverInput } from "@langchain/core/retrievers";
 import { Document } from "@langchain/core/documents";
-import { searchArxiv, loadDocsFromResults, getDocsFromSummaries } from '../utils/arxiv.js';
+import {
+  searchArxiv,
+  loadDocsFromResults,
+  getDocsFromSummaries,
+} from "../utils/arxiv.js";
 
 export type ArxivRetrieverOptions = {
-    getFullDocuments?: boolean;
-    maxSearchResults?: number;
+  returnFullDocuments?: boolean;
+  maxSearchResults?: number;
 } & BaseRetrieverInput;
 
 /**
@@ -12,34 +16,36 @@ export type ArxivRetrieverOptions = {
  * It can retrieve either full documents (PDFs) or just summaries.
  */
 export class ArxivRetriever extends BaseRetriever {
-    static lc_name() {
-        return "ArxivRetriever";
-    }
+  static lc_name() {
+    return "ArxivRetriever";
+  }
 
-    lc_namespace = ["langchain", "retrievers", "arxiv_retriever"];
+  lc_namespace = ["langchain", "retrievers", "arxiv_retriever"];
 
-    getFullDocuments: boolean;
-    maxSearchResults: number;
+  returnFullDocuments = false;
 
-    constructor(options: ArxivRetrieverOptions = {}) {
-        super(options);
-        this.getFullDocuments = options.getFullDocuments ?? false;
-        this.maxSearchResults = options.maxSearchResults ?? 10;
-    }
+  maxSearchResults = 10;
+
+  constructor(options: ArxivRetrieverOptions = {}) {
+    super(options);
+    this.returnFullDocuments =
+      options.returnFullDocuments ?? this.returnFullDocuments;
+    this.maxSearchResults = options.maxSearchResults ?? this.maxSearchResults;
+  }
+
+  async _getRelevantDocuments(query: string): Promise<Document[]> {
+    try {
+      const results = await searchArxiv(query, this.maxSearchResults);
 
-    async _getRelevantDocuments(query: string): Promise<Document[]> {
-        try {
-            const results = await searchArxiv(query, this.maxSearchResults);
-
-            if (this.getFullDocuments) {
-                // Fetch and parse PDFs to get full documents
-                return await loadDocsFromResults(results);
-            } else {
-                // Use summaries as documents
-                return getDocsFromSummaries(results);
-            }
-        } catch (error) {
-            throw new Error(`Error retrieving documents from arXiv.`);
-        }
+      if (this.returnFullDocuments) {
+        // Fetch and parse PDFs to get full documents
+        return await loadDocsFromResults(results);
+      } else {
+        // Use summaries as documents
+        return getDocsFromSummaries(results);
+      }
+    } catch (error) {
+      throw new Error(`Error retrieving documents from arXiv.`);
     }
+  }
 }
diff --git a/libs/langchain-community/src/retrievers/tests/arxiv.int.test.ts b/libs/langchain-community/src/retrievers/tests/arxiv.int.test.ts
index 11eb1040ed18..2d8467fe1f57 100644
--- a/libs/langchain-community/src/retrievers/tests/arxiv.int.test.ts
+++ b/libs/langchain-community/src/retrievers/tests/arxiv.int.test.ts
@@ -2,337 +2,317 @@ import { test, expect } from "@jest/globals";
 import { ArxivRetriever } from "../arxiv.js";
 
 test("ArxivRetriever fetching document summaries test", async () => {
-    // Sample integration test for ArxivRetriever using the "machine learning" query
-    const retriever = new ArxivRetriever(
-        {
-            getFullDocuments: false,
-            maxSearchResults: 5
-        }
+  // Sample integration test for ArxivRetriever using the "machine learning" query
+  const retriever = new ArxivRetriever({
+    returnFullDocuments: false,
+    maxSearchResults: 5,
+  });
+  const query = "machine learning";
+  const results = await retriever._getRelevantDocuments(query);
+
+  expect(results).toBeDefined();
+  expect(results.length).toBeGreaterThan(0);
+  expect(results.length).toBeLessThanOrEqual(5);
+
+  for (let i = 0; i < results.length; i += 1) {
+    expect(results[i]).toHaveProperty("pageContent");
+    expect(results[i].pageContent).toBeDefined();
+
+    expect(results[i]).toHaveProperty("metadata");
+    expect(results[i].metadata).toBeInstanceOf(Object);
+    expect(results[i].metadata).toHaveProperty("authors");
+    expect(results[i].metadata.authors).toBeInstanceOf(Array);
+    expect(results[i].metadata).toHaveProperty("id");
+    expect(results[i].metadata.id).toContain("arxiv.org");
+    expect(results[i].metadata).toHaveProperty("published");
+    expect(results[i].metadata.published).toMatch(
+      /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/
     );
-    const query = "machine learning";
-    const results = await retriever._getRelevantDocuments(query);
-
-    expect(results).toBeDefined();
-    expect(results.length).toBeGreaterThan(0);
-    expect(results.length).toBeLessThanOrEqual(5);
-
-    for (let i = 0; i < results.length; i += 1) {
-        expect(results[i]).toHaveProperty("pageContent");
-        expect(results[i].pageContent).toBeDefined();
-
-        expect(results[i]).toHaveProperty("metadata");
-        expect(results[i].metadata).toBeInstanceOf(Object);
-        expect(results[i].metadata).toHaveProperty("authors");
-        expect(results[i].metadata.authors).toBeInstanceOf(Array);
-        expect(results[i].metadata).toHaveProperty("id");
-        expect(results[i].metadata.id).toContain("arxiv.org");
-        expect(results[i].metadata).toHaveProperty("published");
-        expect(results[i].metadata.published).toMatch(/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/);
-        expect(results[i].metadata).toHaveProperty("source");
-        expect(results[i].metadata.source).toBe("arxiv");
-        expect(results[i].metadata).toHaveProperty("title");
-        expect(results[i].metadata).toHaveProperty("updated");
-        expect(results[i].metadata.updated).toMatch(/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/);
-        expect(results[i].metadata).toHaveProperty("url");
-        expect(results[i].metadata.url).toContain("arxiv.org");
-    }
+    expect(results[i].metadata).toHaveProperty("source");
+    expect(results[i].metadata.source).toBe("arxiv");
+    expect(results[i].metadata).toHaveProperty("title");
+    expect(results[i].metadata).toHaveProperty("updated");
+    expect(results[i].metadata.updated).toMatch(
+      /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/
+    );
+    expect(results[i].metadata).toHaveProperty("url");
+    expect(results[i].metadata.url).toContain("arxiv.org");
+  }
 });
 
 test("ArxivRetriever fetching document summaries with invalid query test", async () => {
-    // Sample test for ArxivRetriever using an invalid query
-    const retriever = new ArxivRetriever(
-        {
-            getFullDocuments: false,
-            maxSearchResults: 5
-        }
-    );
-    const query = "fjalsdkjfw";
-    const results = await retriever._getRelevantDocuments(query);
-
-    expect(results).toBeDefined();
-    expect(results.length).toBe(0);
+  // Sample test for ArxivRetriever using an invalid query
+  const retriever = new ArxivRetriever({
+    returnFullDocuments: false,
+    maxSearchResults: 5,
+  });
+  const query = "fjalsdkjfw";
+  const results = await retriever._getRelevantDocuments(query);
+
+  expect(results).toBeDefined();
+  expect(results.length).toBe(0);
 });
 
 test("ArxivRetriever fetching document summaries with empty query test", async () => {
-    // Sample test for ArxivRetriever using an empty query
-    const retriever = new ArxivRetriever(
-        {
-            getFullDocuments: false,
-            maxSearchResults: 5
-        }
-    );
-    const query = "";
-    const results = await retriever._getRelevantDocuments(query);
-
-    expect(results).toBeDefined();
-    expect(results.length).toBe(0);
+  // Sample test for ArxivRetriever using an empty query
+  const retriever = new ArxivRetriever({
+    returnFullDocuments: false,
+    maxSearchResults: 5,
+  });
+  const query = "";
+  const results = await retriever._getRelevantDocuments(query);
+
+  expect(results).toBeDefined();
+  expect(results.length).toBe(0);
 });
 
 test("ArxivRetriever fetching document summaries with invalid maxSearchResults test", async () => {
-    // Sample test for ArxivRetriever using an invalid maxSearchResults
-    try {
-        const retriever = new ArxivRetriever(
-            {
-                getFullDocuments: true,
-                maxSearchResults: -1
-            }
-        );
-        const query = "machine learning";
-        const results = await retriever._getRelevantDocuments(query);
-        expect(results).toBeUndefined();
-        expect(results.length).toBe(0);
-    } catch (error) {
-        expect(error).toBeDefined();
-        expect(error).toBeInstanceOf(Error);
-    }
+  // Sample test for ArxivRetriever using an invalid maxSearchResults
+  try {
+    const retriever = new ArxivRetriever({
+      returnFullDocuments: true,
+      maxSearchResults: -1,
+    });
+    const query = "machine learning";
+    const results = await retriever._getRelevantDocuments(query);
+    expect(results).toBeUndefined();
+    expect(results.length).toBe(0);
+  } catch (error) {
+    expect(error).toBeDefined();
+    expect(error).toBeInstanceOf(Error);
+  }
 });
 
 test("ArxivRetriever fetching document summaries with zero maxSearchResults test", async () => {
-    // Sample test for ArxivRetriever using an zero maxSearchResults
-    try {
-        const retriever = new ArxivRetriever(
-            {
-                getFullDocuments: true,
-                maxSearchResults: 0
-            }
-        );
-        const query = "machine learning";
-        const results = await retriever._getRelevantDocuments(query);
-        expect(results).toBeUndefined();
-        expect(results.length).toBe(0);
-    } catch (error) {
-        expect(error).toBeDefined();
-        expect(error).toBeInstanceOf(Error);
-    }
+  // Sample test for ArxivRetriever using a zero maxSearchResults
+  try {
+    const retriever = new ArxivRetriever({
+      returnFullDocuments: true,
+      maxSearchResults: 0,
+    });
+    const query = "machine learning";
+    const results = await retriever._getRelevantDocuments(query);
+    expect(results).toBeUndefined();
+    expect(results.length).toBe(0);
+  } catch (error) {
+    expect(error).toBeDefined();
+    expect(error).toBeInstanceOf(Error);
+  }
 });
 
 test("ArxivRetriever fetching full documents test", async () => {
-    // Sample test for fetching full documents with ArxivRetriever
-    const retriever = new ArxivRetriever(
-        {
-            getFullDocuments: true,
-            maxSearchResults: 5
-        }
+  // Sample test for fetching full documents with ArxivRetriever
+  const retriever = new ArxivRetriever({
+    returnFullDocuments: true,
+    maxSearchResults: 5,
+  });
+  const query = "machine learning";
+  const results = await retriever._getRelevantDocuments(query);
+
+  expect(results).toBeDefined();
+  expect(results.length).toBeGreaterThan(0);
+  expect(results.length).toBeLessThanOrEqual(5);
+
+  for (let i = 0; i < results.length; i += 1) {
+    expect(results[i]).toHaveProperty("pageContent");
+    expect(results[i].pageContent).toBeDefined();
+
+    expect(results[i]).toHaveProperty("id");
+
+    expect(results[i]).toHaveProperty("metadata");
+    expect(results[i].metadata).toBeInstanceOf(Object);
+    expect(results[i].metadata).toHaveProperty("authors");
+    expect(results[i].metadata.authors).toBeInstanceOf(Array);
+    expect(results[i].metadata).toHaveProperty("id");
+    expect(results[i].metadata.id).toContain("arxiv.org");
+    expect(results[i].metadata).toHaveProperty("published");
+    expect(results[i].metadata.published).toMatch(
+      /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/
     );
-    const query = "machine learning";
-    const results = await retriever._getRelevantDocuments(query);
-
-    expect(results).toBeDefined();
-    expect(results.length).toBeGreaterThan(0);
-    expect(results.length).toBeLessThanOrEqual(5);
-
-    for (let i = 0; i < results.length; i += 1) {
-        expect(results[i]).toHaveProperty("pageContent");
-        expect(results[i].pageContent).toBeDefined();
-
-        expect(results[i]).toHaveProperty("id");
-
-        expect(results[i]).toHaveProperty("metadata");
-        expect(results[i].metadata).toBeInstanceOf(Object);
-        expect(results[i].metadata).toHaveProperty("authors");
-        expect(results[i].metadata.authors).toBeInstanceOf(Array);
-        expect(results[i].metadata).toHaveProperty("id");
-        expect(results[i].metadata.id).toContain("arxiv.org");
-        expect(results[i].metadata).toHaveProperty("published");
-        expect(results[i].metadata.published).toMatch(/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/);
-        expect(results[i].metadata).toHaveProperty("source");
-        expect(results[i].metadata.source).toBe("arxiv");
-        expect(results[i].metadata).toHaveProperty("title");
-        expect(results[i].metadata).toHaveProperty("updated");
-        expect(results[i].metadata.updated).toMatch(/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/);
-        expect(results[i].metadata).toHaveProperty("url");
-        expect(results[i].metadata.url).toContain("arxiv.org");
-        expect(results[i].metadata).toHaveProperty("summary");
-    }
+    expect(results[i].metadata).toHaveProperty("source");
+    expect(results[i].metadata.source).toBe("arxiv");
+    expect(results[i].metadata).toHaveProperty("title");
+    expect(results[i].metadata).toHaveProperty("updated");
+    expect(results[i].metadata.updated).toMatch(
+      /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/
+    );
+    expect(results[i].metadata).toHaveProperty("url");
+    expect(results[i].metadata.url).toContain("arxiv.org");
+    expect(results[i].metadata).toHaveProperty("summary");
+  }
 });
 
 test("ArxivRetriever fetching full documents with invalid query test", async () => {
-    // Sample test for fetching full documents with ArxivRetriever using an invalid query
-    const retriever = new ArxivRetriever(
-        {
-            getFullDocuments: true,
-            maxSearchResults: 5
-        }
-    );
-    const query = "fjalsdkjfw";
-    const results = await retriever._getRelevantDocuments(query);
-
-    expect(results).toBeDefined();
-    expect(results.length).toBe(0);
+  // Sample test for fetching full documents with ArxivRetriever using an invalid query
+  const retriever = new ArxivRetriever({
+    returnFullDocuments: true,
+    maxSearchResults: 5,
+  });
+  const query = "fjalsdkjfw";
+  const results = await retriever._getRelevantDocuments(query);
+
+  expect(results).toBeDefined();
+  expect(results.length).toBe(0);
 });
 
 test("ArxivRetriever fetching full documents with empty query test", async () => {
-    // Sample test for fetching full documents with ArxivRetriever using an empty query
-    const retriever = new ArxivRetriever(
-        {
-            getFullDocuments: true,
-            maxSearchResults: 5
-        }
-    );
-    const query = "";
-    const results = await retriever._getRelevantDocuments(query);
-
-    expect(results).toBeDefined();
-    expect(results.length).toBe(0);
+  // Sample test for fetching full documents with ArxivRetriever using an empty query
+  const retriever = new ArxivRetriever({
+    returnFullDocuments: true,
+    maxSearchResults: 5,
+  });
+  const query = "";
+  const results = await retriever._getRelevantDocuments(query);
+
+  expect(results).toBeDefined();
+  expect(results.length).toBe(0);
 });
 
 test("ArxivRetriever fetching full documents with invalid maxSearchResults test", async () => {
-    // Sample test for fetching full documents with ArxivRetriever using an invalid maxSearchResults
-    try {
-        const retriever = new ArxivRetriever(
-            {
-                getFullDocuments: true,
-                maxSearchResults: -1
-            }
-        );
-        const query = "machine learning";
-        const results = await retriever._getRelevantDocuments(query);
-        expect(results).toBeUndefined();
-        expect(results.length).toBe(0);
-    } catch (error) {
-        expect(error).toBeDefined();
-        expect(error).toBeInstanceOf(Error);
-    }
+  // Sample test for fetching full documents with ArxivRetriever using an invalid maxSearchResults
+  try {
+    const retriever = new ArxivRetriever({
+      returnFullDocuments: true,
+      maxSearchResults: -1,
+    });
+    const query = "machine learning";
+    const results = await retriever._getRelevantDocuments(query);
+    expect(results).toBeUndefined();
+    expect(results.length).toBe(0);
+  } catch (error) {
+    expect(error).toBeDefined();
+    expect(error).toBeInstanceOf(Error);
+  }
 });
 
 test("ArxivRetriever fetching full documents with zero maxSearchResults", async () => {
-    // Sample test for fetching full documents with ArxivRetriever using an zero maxSearchResults
-    try {
-        const retriever = new ArxivRetriever(
-            {
-                getFullDocuments: true,
-                maxSearchResults: 0
-            }
-        );
-        const query = "machine learning";
-        const results = await retriever._getRelevantDocuments(query);
-        expect(results).toBeUndefined();
-        expect(results.length).toBe(0);
-    } catch (error) {
-        expect(error).toBeDefined();
-        expect(error).toBeInstanceOf(Error);
-    }
+  // Sample test for fetching full documents with ArxivRetriever using a zero maxSearchResults
+  try {
+    const retriever = new ArxivRetriever({
+      returnFullDocuments: true,
+      maxSearchResults: 0,
+    });
+    const query = "machine learning";
+    const results = await retriever._getRelevantDocuments(query);
+    expect(results).toBeUndefined();
+    expect(results.length).toBe(0);
+  } catch (error) {
+    expect(error).toBeDefined();
+    expect(error).toBeInstanceOf(Error);
+  }
 });
 
 test("ArxivRetriever search articles by id test", async () => {
-    // Sample test for fetching articles by arXiv IDs
-    const fetchIds = "2103.03404 2103.03405";
-    const retriever = new ArxivRetriever(
-        {
-            getFullDocuments: false,
-            maxSearchResults: 5
-        }
+  // Sample test for fetching articles by arXiv IDs
+  const fetchIds = "2103.03404 2103.03405";
+  const retriever = new ArxivRetriever({
+    returnFullDocuments: false,
+    maxSearchResults: 5,
+  });
+  const results = await retriever.invoke(fetchIds);
+
+  expect(results).toBeDefined();
+  expect(results.length).toBe(2);
+
+  for (let i = 0; i < results.length; i += 1) {
+    expect(results[i]).toHaveProperty("pageContent");
+    expect(results[i].pageContent).toBeDefined();
+
+    expect(results[i]).toHaveProperty("metadata");
+    expect(results[i].metadata).toBeInstanceOf(Object);
+    expect(results[i].metadata).toHaveProperty("authors");
+    expect(results[i].metadata.authors).toBeInstanceOf(Array);
+    expect(results[i].metadata).toHaveProperty("id");
+    expect(results[i].metadata.id).toContain("arxiv.org");
+    expect(results[i].metadata).toHaveProperty("published");
+    expect(results[i].metadata.published).toMatch(
+      /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/
     );
-    const results = await retriever.invoke(fetchIds);
-
-    expect(results).toBeDefined();
-    expect(results.length).toBe(2);
-
-    for (let i = 0; i < results.length; i += 1) {
-        expect(results[i]).toHaveProperty("pageContent");
-        expect(results[i].pageContent).toBeDefined();
-
-        expect(results[i]).toHaveProperty("metadata");
-        expect(results[i].metadata).toBeInstanceOf(Object);
-        expect(results[i].metadata).toHaveProperty("authors");
-        expect(results[i].metadata.authors).toBeInstanceOf(Array);
-        expect(results[i].metadata).toHaveProperty("id");
-        expect(results[i].metadata.id).toContain("arxiv.org");
-        expect(results[i].metadata).toHaveProperty("published");
-        expect(results[i].metadata.published).toMatch(/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/);
-        expect(results[i].metadata).toHaveProperty("source");
-        expect(results[i].metadata.source).toBe("arxiv");
-        expect(results[i].metadata).toHaveProperty("title");
-        expect(results[i].metadata).toHaveProperty("updated");
-        expect(results[i].metadata.updated).toMatch(/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/);
-        expect(results[i].metadata).toHaveProperty("url");
-        expect(results[i].metadata.url).toContain("arxiv.org");
-    }
+    expect(results[i].metadata).toHaveProperty("source");
+    expect(results[i].metadata.source).toBe("arxiv");
+    expect(results[i].metadata).toHaveProperty("title");
+    expect(results[i].metadata).toHaveProperty("updated");
+    expect(results[i].metadata.updated).toMatch(
+      /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/
+    );
+    expect(results[i].metadata).toHaveProperty("url");
+    expect(results[i].metadata.url).toContain("arxiv.org");
+  }
 });
 
 test("ArxivRetriever search articles by id with invalid id test", async () => {
-    // Sample test for fetching articles by arXiv IDs with an invalid ID
-    const fetchIds = "2103.03404 2103.03405 1234.56789";
-    const retriever = new ArxivRetriever(
-        {
-            getFullDocuments: false,
-            maxSearchResults: 5
-        }
-    );
-    const results = await retriever.invoke(fetchIds);
-
-    expect(results).toBeDefined();
-    expect(results.length).toBeLessThan(3);
+  // Sample test for fetching articles by arXiv IDs with an invalid ID
+  const fetchIds = "2103.03404 2103.03405 1234.56789";
+  const retriever = new ArxivRetriever({
+    returnFullDocuments: false,
+    maxSearchResults: 5,
+  });
+  const results = await retriever.invoke(fetchIds);
+
+  expect(results).toBeDefined();
+  expect(results.length).toBeLessThan(3);
 });
 
 test("ArxivRetriever search articles by id with empty id test", async () => {
-    // Sample test for fetching articles by arXiv IDs with an empty ID
-    const fetchIds = "";
-    const retriever = new ArxivRetriever(
-        {
-            getFullDocuments: false,
-            maxSearchResults: 5
-        }
-    );
-    const results = await retriever.invoke(fetchIds);
-
-    expect(results).toBeDefined();
-    expect(results.length).toBe(0);
+  // Sample test for fetching articles by arXiv IDs with an empty ID
+  const fetchIds = "";
+  const retriever = new ArxivRetriever({
+    returnFullDocuments: false,
+    maxSearchResults: 5,
+  });
+  const results = await retriever.invoke(fetchIds);
+
+  expect(results).toBeDefined();
+  expect(results.length).toBe(0);
 });
 
 test("ArxivRetriever search articles by id with invalid maxSearchResults test", async () => {
-    // Sample test for fetching articles by arXiv IDs with an invalid maxSearchResults
-    try {
-        const fetchIds = "2103.03404 2103.03405";
-        const retriever = new ArxivRetriever(
-            {
-                getFullDocuments: false,
-                maxSearchResults: -1
-            }
-        );
-        const results = await retriever.invoke(fetchIds);
-        expect(results).toBeUndefined();
-        expect(results.length).toBe(0);
-    } catch (error) {
-        expect(error).toBeDefined();
-        expect(error).toBeInstanceOf(Error);
-    }
+  // Sample test for fetching articles by arXiv IDs with an invalid maxSearchResults
+  try {
+    const fetchIds = "2103.03404 2103.03405";
+    const retriever = new ArxivRetriever({
+      returnFullDocuments: false,
+      maxSearchResults: -1,
+    });
+    const results = await retriever.invoke(fetchIds);
+    expect(results).toBeUndefined();
+    expect(results.length).toBe(0);
+  } catch (error) {
+    expect(error).toBeDefined();
+    expect(error).toBeInstanceOf(Error);
+  }
 });
 
 test("ArxivRetriever search articles by id with invalid id and maxSearchResults test", async () => {
-    // Sample test for fetching articles by arXiv IDs with an invalid ID and maxSearchResults
-    try {
-        const fetchIds = "2103.03404 2103.03405 1234.56789";
-        const retriever = new ArxivRetriever(
-            {
-                getFullDocuments: false,
-                maxSearchResults: -1
-            }
-        );
-        const results = await retriever.invoke(fetchIds);
-        expect(results).toBeUndefined();
-        expect(results.length).toBe(0);
-    } catch (error) {
-        expect(error).toBeDefined();
-        expect(error).toBeInstanceOf(Error);
-    }
+  // Sample test for fetching articles by arXiv IDs with an invalid ID and maxSearchResults
+  try {
+    const fetchIds = "2103.03404 2103.03405 1234.56789";
+    const retriever = new ArxivRetriever({
+      returnFullDocuments: false,
+      maxSearchResults: -1,
+    });
+    const results = await retriever.invoke(fetchIds);
+    expect(results).toBeUndefined();
+    expect(results.length).toBe(0);
+  } catch (error) {
+    expect(error).toBeDefined();
+    expect(error).toBeInstanceOf(Error);
+  }
 });
 
 test("ArxivRetriever search articles by id with invalid id and zero maxSearchResults test", async () => {
-    // Sample test for fetching articles by arXiv IDs with an invalid ID and zero maxSearchResults
-    try {
-        const fetchIds = "2103.03404 2103.03405 1234.56789";
-        const retriever = new ArxivRetriever(
-            {
-                getFullDocuments: false,
-                maxSearchResults: 0
-            }
-        );
-        const results = await retriever.invoke(fetchIds);
-        expect(results).toBeUndefined();
-        expect(results.length).toBe(0);
-    } catch (error) {
-        expect(error).toBeDefined();
-        expect(error).toBeInstanceOf(Error);
-    }
-});
\ No newline at end of file
+  // Sample test for fetching articles by arXiv IDs with an invalid ID and zero maxSearchResults
+  try {
+    const fetchIds = "2103.03404 2103.03405 1234.56789";
+    const retriever = new ArxivRetriever({
+      returnFullDocuments: false,
+      maxSearchResults: 0,
+    });
+    const results = await retriever.invoke(fetchIds);
+    expect(results).toBeUndefined();
+    expect(results.length).toBe(0);
+  } catch (error) {
+    expect(error).toBeDefined();
+    expect(error).toBeInstanceOf(Error);
+  }
+});
diff --git a/libs/langchain-community/src/utils/arxiv.ts b/libs/langchain-community/src/utils/arxiv.ts
index c4ee12146bcf..9052fad0eb99 100644
--- a/libs/langchain-community/src/utils/arxiv.ts
+++ b/libs/langchain-community/src/utils/arxiv.ts
@@ -1,225 +1,241 @@
-import { PDFLoader } from "../document_loaders/fs/pdf.js";
-import { XMLParser } from 'fast-xml-parser'; // For parsing XML
 import { Document } from "@langchain/core/documents";
+import { XMLParser } from "fast-xml-parser";
+
+import { PDFLoader } from "../document_loaders/fs/pdf.js";
 
 // Interface for processed arXiv entry
 interface ArxivEntry {
-    id: string;
-    title: string;
-    summary: string;
-    published: string;
-    updated: string;
-    authors: string[];
-    pdfUrl: string;
-    links: any[];
+  id: string;
+  title: string;
+  summary: string;
+  published: string;
+  updated: string;
+  authors: string[];
+  pdfUrl: string;
+  links: any[];
 }
 
 // Used to check if the query is an arXiv ID, or a natural language query
 export function isArXivIdentifier(query: string): boolean {
-    const arxivIdRegex = /^\d{4}\.\d{4,5}(v\d+)?$|^\d{7}(\.\d+)?(v\d+)?$/;
-    return arxivIdRegex.test(query.trim());
+  const arxivIdRegex = /^\d{4}\.\d{4,5}(v\d+)?$|^\d{7}(\.\d+)?(v\d+)?$/;
+  return arxivIdRegex.test(query.trim());
 }
 
 // Used to fetch direct arXiv articles by IDs (supports multiple IDs)
-export async function fetchDirectArxivArticle(arxivIds: string): Promise<ArxivEntry[]> {
-    try {
-        const idList = arxivIds.split(/[\s,]+/).map(id => id.trim()).filter(Boolean).join(',');
-        const url = `http://export.arxiv.org/api/query?id_list=${idList}`;
-        const response = await fetch(url);
-        
-        if (!response.ok) {
-            throw new Error(`HTTP error! status: ${response.status}`);
-        }
-        
-        const xml = await response.text();
-
-        const parser = new XMLParser({
-            ignoreAttributes: false,
-            attributeNamePrefix: "@_",
-        });
-        const result = parser.parse(xml);
-        let entries = result.feed.entry;
-
-        if (!entries) {
-            return [];
-        }
-
-        // Ensure entries is an array
-        if (!Array.isArray(entries)) {
-            entries = [entries];
-        }
-
-        const processedEntries = entries.map(processEntry);
-
-        return processedEntries;
-    } catch (error) {
-        throw new Error(`Failed to fetch articles with IDs ${arxivIds}`);
+export async function fetchDirectArxivArticle(
+  arxivIds: string
+): Promise<ArxivEntry[]> {
+  try {
+    const idList = arxivIds
+      .split(/[\s,]+/)
+      .map((id) => id.trim())
+      .filter(Boolean)
+      .join(",");
+    const url = `http://export.arxiv.org/api/query?id_list=${idList}`;
+    const response = await fetch(url);
+
+    if (!response.ok) {
+      throw new Error(`HTTP error! status: ${response.status}`);
+    }
+
+    const xml = await response.text();
+
+    const parser = new XMLParser({
+      ignoreAttributes: false,
+      attributeNamePrefix: "@_",
+    });
+    const result = parser.parse(xml);
+    let entries = result.feed.entry;
+
+    if (!entries) {
+      return [];
+    }
+
+    // Ensure entries is an array
+    if (!Array.isArray(entries)) {
+      entries = [entries];
     }
+
+    const processedEntries = entries.map(processEntry);
+
+    return processedEntries;
+  } catch (error) {
+    throw new Error(`Failed to fetch articles with IDs ${arxivIds}`);
+  }
 }
 
 // Used to fetch arXiv results by natural language query with maxResults parameter
-export async function fetchArxivResultsByQuery(query: string, start = 0, maxResults = 10): Promise<ArxivEntry[]> {
-    try {
-        const encodedQuery = encodeURIComponent(query);
-        const url = `http://export.arxiv.org/api/query?search_query=all:${encodedQuery}&start=${start}&max_results=${maxResults}`;
-        const response = await fetch(url);
-        
-        if (!response.ok) {
-            throw new Error(`HTTP error! status: ${response.status}`);
-        }
-        
-        const xml = await response.text();
-
-        const parser = new XMLParser({
-            ignoreAttributes: false,
-            attributeNamePrefix: "@_",
-        });
-        const result = parser.parse(xml);
-        let entries = result.feed.entry;
-
-        if (!entries) {
-            return [];
-        }
-
-        // Ensure entries is an array
-        if (!Array.isArray(entries)) {
-            entries = [entries];
-        }
-
-        const processedEntries = entries.map(processEntry);
-
-        return processedEntries;
-    } catch (error) {
-        throw new Error(`Failed to fetch articles with query "${query}"`);
+export async function fetchArxivResultsByQuery(
+  query: string,
+  start = 0,
+  maxResults = 10
+): Promise<ArxivEntry[]> {
+  try {
+    const encodedQuery = encodeURIComponent(query);
+    const url = `http://export.arxiv.org/api/query?search_query=all:${encodedQuery}&start=${start}&max_results=${maxResults}`;
+    const response = await fetch(url);
+
+    if (!response.ok) {
+      throw new Error(`HTTP error! status: ${response.status}`);
     }
+
+    const xml = await response.text();
+
+    const parser = new XMLParser({
+      ignoreAttributes: false,
+      attributeNamePrefix: "@_",
+    });
+    const result = parser.parse(xml);
+    let entries = result.feed.entry;
+
+    if (!entries) {
+      return [];
+    }
+
+    // Ensure entries is an array
+    if (!Array.isArray(entries)) {
+      entries = [entries];
+    }
+
+    const processedEntries = entries.map(processEntry);
+
+    return processedEntries;
+  } catch (error) {
+    throw new Error(`Failed to fetch articles with query "${query}"`);
+  }
 }
 
 // Used to search for arXiv articles with a maxResults parameter
-export async function searchArxiv(query: string, maxResults = 3): Promise<ArxivEntry[]> {
-    if (isArXivIdentifier(query)) {
-        return await fetchDirectArxivArticle(query);
-    } else {
-        return await fetchArxivResultsByQuery(query, 0, maxResults);
-    }
+export async function searchArxiv(
+  query: string,
+  maxResults = 3
+): Promise<ArxivEntry[]> {
+  if (isArXivIdentifier(query)) {
+    return await fetchDirectArxivArticle(query);
+  } else {
+    return await fetchArxivResultsByQuery(query, 0, maxResults);
+  }
 }
 
 // Used to fetch and parse PDF to text
 export async function fetchAndParsePDF(pdfUrl: string): Promise<string> {
-    try {
-        // Fetch the PDF
-        const response = await fetch(pdfUrl);
-        
-        if (!response.ok) {
-            throw new Error(`HTTP error! status: ${response.status}`);
-        }
-        
-        const buffer = await response.arrayBuffer();
-
-        // Convert the ArrayBuffer to a Blob
-        const blob = new Blob([buffer], { type: "application/pdf" });
-
-        // Use PDFLoader to process the PDF
-        const loader = new PDFLoader(blob, { splitPages: false }); // Pass the Blob
-        const docs: Document[] = await loader.load();
-
-        // Combine all document content into a single string
-        const content = docs.map((doc) => doc.pageContent).join("\n\n");
-        return content;
-    } catch (error) {
-        throw new Error(`Failed to fetch or parse PDF from ${pdfUrl}`);
+  try {
+    // Fetch the PDF
+    const response = await fetch(pdfUrl);
+
+    if (!response.ok) {
+      throw new Error(`HTTP error! status: ${response.status}`);
     }
+
+    const buffer = await response.arrayBuffer();
+
+    // Convert the ArrayBuffer to a Blob
+    const blob = new Blob([buffer], { type: "application/pdf" });
+
+    // Use PDFLoader to process the PDF
+    const loader = new PDFLoader(blob, { splitPages: false }); // Pass the Blob
+    const docs: Document[] = await loader.load();
+
+    // Combine all document content into a single string
+    const content = docs.map((doc) => doc.pageContent).join("\n\n");
+    return content;
+  } catch (error) {
+    throw new Error(`Failed to fetch or parse PDF from ${pdfUrl}`);
+  }
 }
 
 // Used to load raw text from each search result, and convert to Document instances
-export async function loadDocsFromResults(results: ArxivEntry[]): Promise<Document[]> {
-    const docs: Document[] = [];
-    for (const result of results) {
-        const pdfUrl = result.pdfUrl;
-        try {
-            const pdfContent = await fetchAndParsePDF(pdfUrl);
-            const metadata = {
-                id: result.id,
-                title: result.title,
-                authors: result.authors,
-                published: result.published,
-                updated: result.updated,
-                source: 'arxiv',
-                url: result.id,
-                summary: result.summary,
-            };
-            const doc = new Document({
-                pageContent: pdfContent,
-                metadata,
-            });
-            docs.push(doc);
-        } catch (error) {
-            throw new Error(`Error loading document from ${pdfUrl}`);
-        }
+export async function loadDocsFromResults(
+  results: ArxivEntry[]
+): Promise<Document[]> {
+  const docs: Document[] = [];
+  for (const result of results) {
+    const pdfUrl = result.pdfUrl;
+    try {
+      const pdfContent = await fetchAndParsePDF(pdfUrl);
+      const metadata = {
+        id: result.id,
+        title: result.title,
+        authors: result.authors,
+        published: result.published,
+        updated: result.updated,
+        source: "arxiv",
+        url: result.id,
+        summary: result.summary,
+      };
+      const doc = new Document({
+        pageContent: pdfContent,
+        metadata,
+      });
+      docs.push(doc);
+    } catch (error) {
+      throw new Error(`Error loading document from ${pdfUrl}`);
     }
-    return docs;
+  }
+  return docs;
 }
 
 // Used to convert metadata and summaries to Document instances
 export function getDocsFromSummaries(results: ArxivEntry[]): Document[] {
-    const docs: Document[] = [];
-    for (const result of results) {
-        const metadata = {
-            id: result.id,
-            title: result.title,
-            authors: result.authors,
-            published: result.published,
-            updated: result.updated,
-            source: 'arxiv',
-            url: result.id,
-        };
-        const doc = new Document({
-            pageContent: result.summary,
-            metadata,
-        });
-        docs.push(doc);
-    }
-    return docs;
+  const docs: Document[] = [];
+  for (const result of results) {
+    const metadata = {
+      id: result.id,
+      title: result.title,
+      authors: result.authors,
+      published: result.published,
+      updated: result.updated,
+      source: "arxiv",
+      url: result.id,
+    };
+    const doc = new Document({
+      pageContent: result.summary,
+      metadata,
+    });
+    docs.push(doc);
+  }
+  return docs;
 }
 
 // Helper function to process each arXiv entry
 function processEntry(entry: any): ArxivEntry {
-    const id = entry.id;
-    const title = entry.title.replace(/\s+/g, ' ').trim();
-    const summary = entry.summary.replace(/\s+/g, ' ').trim();
-    const published = entry.published;
-    const updated = entry.updated;
-
-    // Extract authors
-    let authors: string[] = [];
-    if (Array.isArray(entry.author)) {
-        authors = entry.author.map((author: any) => author.name);
-    } else if (entry.author) {
-        authors = [entry.author.name];
-    }
-
-    // Extract links
-    let links: any[] = [];
-    if (Array.isArray(entry.link)) {
-        links = entry.link;
-    } else if (entry.link) {
-        links = [entry.link];
-    }
-
-    // Extract PDF link
-    let pdfUrl = id.replace('/abs/', '/pdf/') + '.pdf';
-    const pdfLinkObj = links.find((link: any) => link["@_title"] === 'pdf');
-    if (pdfLinkObj && pdfLinkObj["@_href"]) {
-        pdfUrl = pdfLinkObj["@_href"];
-    }
-
-    return {
-        id,
-        title,
-        summary,
-        published,
-        updated,
-        authors,
-        pdfUrl,
-        links,
-    };
-}
\ No newline at end of file
+  const id = entry.id;
+  const title = entry.title.replace(/\s+/g, " ").trim();
+  const summary = entry.summary.replace(/\s+/g, " ").trim();
+  const published = entry.published;
+  const updated = entry.updated;
+
+  // Extract authors
+  let authors: string[] = [];
+  if (Array.isArray(entry.author)) {
+    authors = entry.author.map((author: any) => author.name);
+  } else if (entry.author) {
+    authors = [entry.author.name];
+  }
+
+  // Extract links
+  let links: any[] = [];
+  if (Array.isArray(entry.link)) {
+    links = entry.link;
+  } else if (entry.link) {
+    links = [entry.link];
+  }
+
+  // Extract PDF link
+  let pdfUrl = id.replace("/abs/", "/pdf/") + ".pdf";
+  const pdfLinkObj = links.find((link: any) => link["@_title"] === "pdf");
+  if (pdfLinkObj && pdfLinkObj["@_href"]) {
+    pdfUrl = pdfLinkObj["@_href"];
+  }
+
+  return {
+    id,
+    title,
+    summary,
+    published,
+    updated,
+    authors,
+    pdfUrl,
+    links,
+  };
+}

From 20cd43cded2f26334949a392433bf79096fb8c05 Mon Sep 17 00:00:00 2001
From: jacoblee93 <jacoblee93@gmail.com>
Date: Mon, 23 Dec 2024 17:54:51 -0800
Subject: [PATCH 11/16] Rename returnFullDocuments option to getFullDocuments

---
 .../retrievers/arxiv-retriever.mdx            |  2 +-
 examples/src/retrievers/arxiv.ts              |  4 +--
 .../src/retrievers/arxiv.ts                   |  9 +++---
 .../src/retrievers/tests/arxiv.int.test.ts    | 32 +++++++++----------
 4 files changed, 23 insertions(+), 24 deletions(-)

diff --git a/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx b/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx
index fff4da2a0a2d..8f85886c38aa 100644
--- a/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx
+++ b/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx
@@ -34,7 +34,7 @@ npm install pdf-parse fast-xml-parser
 
 ```typescript
 const retriever = new ArxivRetriever({
-  returnFullDocuments: false, // Set to true to fetch full documents (PDFs)
+  getFullDocuments: false, // Set to true to fetch full documents (PDFs)
   maxSearchResults: 5, // Maximum number of results to retrieve
 });
 ```
diff --git a/examples/src/retrievers/arxiv.ts b/examples/src/retrievers/arxiv.ts
index fe4a4346371c..3e74502e7d49 100644
--- a/examples/src/retrievers/arxiv.ts
+++ b/examples/src/retrievers/arxiv.ts
@@ -7,7 +7,7 @@ export const run = async () => {
 
   const queryId = "1605.08386 2103.03404";
   const retrieverById = new ArxivRetriever({
-    returnFullDocuments: true,
+    getFullDocuments: true,
     maxSearchResults: 5,
   });
   const documentsById = await retrieverById.invoke(queryId);
@@ -42,7 +42,7 @@ export const run = async () => {
 
   const queryNat = "What is the ImageBind model?";
   const retrieverByNat = new ArxivRetriever({
-    returnFullDocuments: false,
+    getFullDocuments: false,
     maxSearchResults: 2,
   });
   const documentsByQuery = await retrieverByNat.invoke(queryNat);
diff --git a/libs/langchain-community/src/retrievers/arxiv.ts b/libs/langchain-community/src/retrievers/arxiv.ts
index 2eb0ab6c5675..8009ce9f8320 100644
--- a/libs/langchain-community/src/retrievers/arxiv.ts
+++ b/libs/langchain-community/src/retrievers/arxiv.ts
@@ -7,7 +7,7 @@ import {
 } from "../utils/arxiv.js";
 
 export type ArxivRetrieverOptions = {
-  returnFullDocuments?: boolean;
+  getFullDocuments?: boolean;
   maxSearchResults?: number;
 } & BaseRetrieverInput;
 
@@ -22,14 +22,13 @@ export class ArxivRetriever extends BaseRetriever {
 
   lc_namespace = ["langchain", "retrievers", "arxiv_retriever"];
 
-  returnFullDocuments = false;
+  getFullDocuments = false;
 
   maxSearchResults = 10;
 
   constructor(options: ArxivRetrieverOptions = {}) {
     super(options);
-    this.returnFullDocuments =
-      options.returnFullDocuments ?? this.returnFullDocuments;
+    this.getFullDocuments = options.getFullDocuments ?? this.getFullDocuments;
     this.maxSearchResults = options.maxSearchResults ?? this.maxSearchResults;
   }
 
@@ -37,7 +36,7 @@ export class ArxivRetriever extends BaseRetriever {
     try {
       const results = await searchArxiv(query, this.maxSearchResults);
 
-      if (this.returnFullDocuments) {
+      if (this.getFullDocuments) {
         // Fetch and parse PDFs to get full documents
         return await loadDocsFromResults(results);
       } else {
diff --git a/libs/langchain-community/src/retrievers/tests/arxiv.int.test.ts b/libs/langchain-community/src/retrievers/tests/arxiv.int.test.ts
index 2d8467fe1f57..bb05f11504e5 100644
--- a/libs/langchain-community/src/retrievers/tests/arxiv.int.test.ts
+++ b/libs/langchain-community/src/retrievers/tests/arxiv.int.test.ts
@@ -4,7 +4,7 @@ import { ArxivRetriever } from "../arxiv.js";
 test("ArxivRetriever fetching document summaries test", async () => {
   // Sample integration test for ArxivRetriever using the "machine learning" query
   const retriever = new ArxivRetriever({
-    returnFullDocuments: false,
+    getFullDocuments: false,
     maxSearchResults: 5,
   });
   const query = "machine learning";
@@ -43,7 +43,7 @@ test("ArxivRetriever fetching document summaries test", async () => {
 test("ArxivRetriever fetching document summaries with invalid query test", async () => {
   // Sample test for ArxivRetriever using an invalid query
   const retriever = new ArxivRetriever({
-    returnFullDocuments: false,
+    getFullDocuments: false,
     maxSearchResults: 5,
   });
   const query = "fjalsdkjfw";
@@ -56,7 +56,7 @@ test("ArxivRetriever fetching document summaries with invalid query test", async
 test("ArxivRetriever fetching document summaries with empty query test", async () => {
   // Sample test for ArxivRetriever using an empty query
   const retriever = new ArxivRetriever({
-    returnFullDocuments: false,
+    getFullDocuments: false,
     maxSearchResults: 5,
   });
   const query = "";
@@ -70,7 +70,7 @@ test("ArxivRetriever fetching document summaries with invalid maxSearchResults t
   // Sample test for ArxivRetriever using an invalid maxSearchResults
   try {
     const retriever = new ArxivRetriever({
-      returnFullDocuments: true,
+      getFullDocuments: true,
       maxSearchResults: -1,
     });
     const query = "machine learning";
@@ -87,7 +87,7 @@ test("ArxivRetriever fetching document summaries with zero maxSearchResults test
   // Sample test for ArxivRetriever using an zero maxSearchResults
   try {
     const retriever = new ArxivRetriever({
-      returnFullDocuments: true,
+      getFullDocuments: true,
       maxSearchResults: 0,
     });
     const query = "machine learning";
@@ -103,7 +103,7 @@ test("ArxivRetriever fetching document summaries with zero maxSearchResults test
 test("ArxivRetriever fetching full documents test", async () => {
   // Sample test for fetching full documents with ArxivRetriever
   const retriever = new ArxivRetriever({
-    returnFullDocuments: true,
+    getFullDocuments: true,
     maxSearchResults: 5,
   });
   const query = "machine learning";
@@ -145,7 +145,7 @@ test("ArxivRetriever fetching full documents test", async () => {
 test("ArxivRetriever fetching full documents with invalid query test", async () => {
   // Sample test for fetching full documents with ArxivRetriever using an invalid query
   const retriever = new ArxivRetriever({
-    returnFullDocuments: true,
+    getFullDocuments: true,
     maxSearchResults: 5,
   });
   const query = "fjalsdkjfw";
@@ -158,7 +158,7 @@ test("ArxivRetriever fetching full documents with invalid query test", async ()
 test("ArxivRetriever fetching full documents with empty query test", async () => {
   // Sample test for fetching full documents with ArxivRetriever using an empty query
   const retriever = new ArxivRetriever({
-    returnFullDocuments: true,
+    getFullDocuments: true,
     maxSearchResults: 5,
   });
   const query = "";
@@ -172,7 +172,7 @@ test("ArxivRetriever fetching full documents with invalid maxSearchResults test"
   // Sample test for fetching full documents with ArxivRetriever using an invalid maxSearchResults
   try {
     const retriever = new ArxivRetriever({
-      returnFullDocuments: true,
+      getFullDocuments: true,
       maxSearchResults: -1,
     });
     const query = "machine learning";
@@ -189,7 +189,7 @@ test("ArxivRetriever fetching full documents with zero maxSearchResults", async
   // Sample test for fetching full documents with ArxivRetriever using an zero maxSearchResults
   try {
     const retriever = new ArxivRetriever({
-      returnFullDocuments: true,
+      getFullDocuments: true,
       maxSearchResults: 0,
     });
     const query = "machine learning";
@@ -206,7 +206,7 @@ test("ArxivRetriever search articles by id test", async () => {
   // Sample test for fetching articles by arXiv IDs
   const fetchIds = "2103.03404 2103.03405";
   const retriever = new ArxivRetriever({
-    returnFullDocuments: false,
+    getFullDocuments: false,
     maxSearchResults: 5,
   });
   const results = await retriever.invoke(fetchIds);
@@ -244,7 +244,7 @@ test("ArxivRetriever search articles by id with invalid id test", async () => {
   // Sample test for fetching articles by arXiv IDs with an invalid ID
   const fetchIds = "2103.03404 2103.03405 1234.56789";
   const retriever = new ArxivRetriever({
-    returnFullDocuments: false,
+    getFullDocuments: false,
     maxSearchResults: 5,
   });
   const results = await retriever.invoke(fetchIds);
@@ -257,7 +257,7 @@ test("ArxivRetriever search articles by id with empty id test", async () => {
   // Sample test for fetching articles by arXiv IDs with an empty ID
   const fetchIds = "";
   const retriever = new ArxivRetriever({
-    returnFullDocuments: false,
+    getFullDocuments: false,
     maxSearchResults: 5,
   });
   const results = await retriever.invoke(fetchIds);
@@ -271,7 +271,7 @@ test("ArxivRetriever search articles by id with invalid maxSearchResults test",
   try {
     const fetchIds = "2103.03404 2103.03405";
     const retriever = new ArxivRetriever({
-      returnFullDocuments: false,
+      getFullDocuments: false,
       maxSearchResults: -1,
     });
     const results = await retriever.invoke(fetchIds);
@@ -288,7 +288,7 @@ test("ArxivRetriever search articles by id with invalid id and maxSearchResults
   try {
     const fetchIds = "2103.03404 2103.03405 1234.56789";
     const retriever = new ArxivRetriever({
-      returnFullDocuments: false,
+      getFullDocuments: false,
       maxSearchResults: -1,
     });
     const results = await retriever.invoke(fetchIds);
@@ -305,7 +305,7 @@ test("ArxivRetriever search articles by id with invalid id and zero maxSearchRes
   try {
     const fetchIds = "2103.03404 2103.03405 1234.56789";
     const retriever = new ArxivRetriever({
-      returnFullDocuments: false,
+      getFullDocuments: false,
       maxSearchResults: 0,
     });
     const results = await retriever.invoke(fetchIds);

From a630c44f8b3504e3c81b33d591eb718c2ab12240 Mon Sep 17 00:00:00 2001
From: jacoblee93 <jacoblee93@gmail.com>
Date: Mon, 23 Dec 2024 17:56:42 -0800
Subject: [PATCH 12/16] Fix arXiv retriever docs section headings

---
 .../docs/integrations/retrievers/arxiv-retriever.mdx        | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx b/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx
index 8f85886c38aa..2cdefddbbb1c 100644
--- a/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx
+++ b/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx
@@ -19,7 +19,7 @@ The `arXiv Retriever` allows users to query the arXiv database for academic arti
 | ---------------- | ---------------------------- | ---------------------------------------------------------------------------- |
 | `ArxivRetriever` | Academic articles from arXiv | [`@langchain/community`](https://www.npmjs.com/package/@langchain/community) |
 
-## Setup/Installation
+## Setup
 
 Ensure the following dependencies are installed:
 
@@ -30,7 +30,7 @@ Ensure the following dependencies are installed:
 npm install pdf-parse fast-xml-parser
 ```
 
-## Instantiate the retriever
+## Instantiation
 
 ```typescript
 const retriever = new ArxivRetriever({
@@ -96,4 +96,6 @@ const ragChain = RunnableSequence.from([
 await ragChain.invoke("What are the latest advances in quantum computing?");
 ```
 
+## API reference
+
 For detailed documentation of all ArxivRetriever features and configurations, head to the [API reference](https://api.js.langchain.com/classes/_langchain_community.retrievers_arxiv.ArxivRetriever.html)

From 7c4f09f9a0aa7988f198ebf32a369d205bed8edb Mon Sep 17 00:00:00 2001
From: jacoblee93 <jacoblee93@gmail.com>
Date: Mon, 23 Dec 2024 18:12:28 -0800
Subject: [PATCH 13/16] Add optional dep

---
 libs/langchain-community/package.json |  5 +++++
 yarn.lock                             | 15 +++++++++++++++
 2 files changed, 20 insertions(+)

diff --git a/libs/langchain-community/package.json b/libs/langchain-community/package.json
index 447a3eda9484..1a45528ec5b6 100644
--- a/libs/langchain-community/package.json
+++ b/libs/langchain-community/package.json
@@ -166,6 +166,7 @@
     "eslint-plugin-no-instanceof": "^1.0.1",
     "eslint-plugin-prettier": "^4.2.1",
     "faiss-node": "^0.5.1",
+    "fast-xml-parser": "^4.5.1",
     "firebase-admin": "^11.9.0 || ^12.0.0",
     "google-auth-library": "^9.10.0",
     "googleapis": "^126.0.1",
@@ -302,6 +303,7 @@
     "duck-duck-scrape": "^2.2.5",
     "epub2": "^3.0.1",
     "faiss-node": "^0.5.1",
+    "fast-xml-parser": "*",
     "firebase-admin": "^11.9.0 || ^12.0.0",
     "google-auth-library": "*",
     "googleapis": "*",
@@ -584,6 +586,9 @@
     "faiss-node": {
       "optional": true
     },
+    "fast-xml-parser": {
+      "optional": true
+    },
     "firebase-admin": {
       "optional": true
     },
diff --git a/yarn.lock b/yarn.lock
index 9e5a48455320..abae3190907d 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -11908,6 +11908,7 @@ __metadata:
     eslint-plugin-prettier: ^4.2.1
     expr-eval: ^2.0.2
     faiss-node: ^0.5.1
+    fast-xml-parser: ^4.5.1
     firebase-admin: ^11.9.0 || ^12.0.0
     flat: ^5.0.2
     google-auth-library: ^9.10.0
@@ -12050,6 +12051,7 @@ __metadata:
     duck-duck-scrape: ^2.2.5
     epub2: ^3.0.1
     faiss-node: ^0.5.1
+    fast-xml-parser: "*"
     firebase-admin: ^11.9.0 || ^12.0.0
     google-auth-library: "*"
     googleapis: "*"
@@ -12252,6 +12254,8 @@ __metadata:
       optional: true
     faiss-node:
       optional: true
+    fast-xml-parser:
+      optional: true
     firebase-admin:
       optional: true
     google-auth-library:
@@ -28227,6 +28231,17 @@ __metadata:
   languageName: node
   linkType: hard
 
+"fast-xml-parser@npm:^4.5.1":
+  version: 4.5.1
+  resolution: "fast-xml-parser@npm:4.5.1"
+  dependencies:
+    strnum: ^1.0.5
+  bin:
+    fxparser: src/cli/cli.js
+  checksum: aab32d7f08a95b20f9ecdc2d769531a9dc454faf12740873972f8169c04ab9335ac5df1029ebfe829a01ddbb0ec60572cb7769d6be2409e95a9be8fc6a86e92c
+  languageName: node
+  linkType: hard
+
 "fastq@npm:^1.6.0":
   version: 1.15.0
   resolution: "fastq@npm:1.15.0"

From b640c3917aa9cc54810ba75f76006f9e6995fa51 Mon Sep 17 00:00:00 2001
From: jacoblee93 <jacoblee93@gmail.com>
Date: Mon, 23 Dec 2024 18:14:05 -0800
Subject: [PATCH 14/16] Lint

---
 libs/langchain-community/src/utils/arxiv.ts | 1 +
 1 file changed, 1 insertion(+)

diff --git a/libs/langchain-community/src/utils/arxiv.ts b/libs/langchain-community/src/utils/arxiv.ts
index 9052fad0eb99..6a79b78a776a 100644
--- a/libs/langchain-community/src/utils/arxiv.ts
+++ b/libs/langchain-community/src/utils/arxiv.ts
@@ -1,3 +1,4 @@
+/* eslint-disable import/no-extraneous-dependencies */
 import { Document } from "@langchain/core/documents";
 import { XMLParser } from "fast-xml-parser";
 

From 7e49ac24f07941d961c80109bd8284597f0c2f0d Mon Sep 17 00:00:00 2001
From: jacoblee93 <jacoblee93@gmail.com>
Date: Mon, 23 Dec 2024 18:14:39 -0800
Subject: [PATCH 15/16] Fix: remove stray horizontal rule from arXiv retriever docs

---
 docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx | 2 --
 1 file changed, 2 deletions(-)

diff --git a/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx b/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx
index 2cdefddbbb1c..cb4ad949dd1a 100644
--- a/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx
+++ b/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx
@@ -1,7 +1,5 @@
 # ArxivRetriever
 
----
-
 ## Overview
 
 The `arXiv Retriever` allows users to query the arXiv database for academic articles. It supports both full-document retrieval (PDF parsing) and summary-based retrieval. For detailed documentation of all ArxivRetriever features and configurations, head to [API reference](#https://arxiv.org/)

From 25a96f6f770fef374c56c02058f05c84ae8247fd Mon Sep 17 00:00:00 2001
From: jacoblee93 <jacoblee93@gmail.com>
Date: Mon, 23 Dec 2024 18:39:22 -0800
Subject: [PATCH 16/16] Fix docs

---
 .../docs/integrations/retrievers/arxiv-retriever.mdx          | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx b/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx
index cb4ad949dd1a..254c90ca49fe 100644
--- a/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx
+++ b/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx
@@ -1,8 +1,8 @@
 # ArxivRetriever
 
-## Overview
+The `arXiv Retriever` allows users to query the arXiv database for academic articles. It supports both full-document retrieval (PDF parsing) and summary-based retrieval.
 
-The `arXiv Retriever` allows users to query the arXiv database for academic articles. It supports both full-document retrieval (PDF parsing) and summary-based retrieval. For detailed documentation of all ArxivRetriever features and configurations, head to [API reference](#https://arxiv.org/)
+For detailed documentation of all ArxivRetriever features and configurations, head to the [API reference](https://api.js.langchain.com/classes/_langchain_community.retrievers_arxiv.ArxivRetriever.html)
 
 ## Features