From 8f140d790feeb2078603d30b581253c80835aa50 Mon Sep 17 00:00:00 2001 From: Antonio Ferreras Date: Tue, 12 Nov 2024 00:30:28 -0500 Subject: [PATCH 01/16] create ArxivRetriever, arxiv utils file, and config updates --- libs/langchain-community/.gitignore | 4 + libs/langchain-community/langchain.config.js | 1 + libs/langchain-community/package.json | 13 ++ .../src/load/import_map.ts | 1 + .../src/retrievers/arxiv.ts | 45 ++++ libs/langchain-community/src/utils/arxiv.ts | 201 ++++++++++++++++++ 6 files changed, 265 insertions(+) create mode 100644 libs/langchain-community/src/retrievers/arxiv.ts create mode 100644 libs/langchain-community/src/utils/arxiv.ts diff --git a/libs/langchain-community/.gitignore b/libs/langchain-community/.gitignore index 890c93717dea..7e32a755f495 100644 --- a/libs/langchain-community/.gitignore +++ b/libs/langchain-community/.gitignore @@ -610,6 +610,10 @@ retrievers/amazon_knowledge_base.cjs retrievers/amazon_knowledge_base.js retrievers/amazon_knowledge_base.d.ts retrievers/amazon_knowledge_base.d.cts +retrievers/arxiv.cjs +retrievers/arxiv.js +retrievers/arxiv.d.ts +retrievers/arxiv.d.cts retrievers/bm25.cjs retrievers/bm25.js retrievers/bm25.d.ts diff --git a/libs/langchain-community/langchain.config.js b/libs/langchain-community/langchain.config.js index 63b495f92f2c..bb88b074ef48 100644 --- a/libs/langchain-community/langchain.config.js +++ b/libs/langchain-community/langchain.config.js @@ -193,6 +193,7 @@ export const config = { // retrievers "retrievers/amazon_kendra": "retrievers/amazon_kendra", "retrievers/amazon_knowledge_base": "retrievers/amazon_knowledge_base", + "retrievers/arxiv": "retrievers/arxiv", "retrievers/bm25": "retrievers/bm25", "retrievers/chaindesk": "retrievers/chaindesk", "retrievers/databerry": "retrievers/databerry", diff --git a/libs/langchain-community/package.json b/libs/langchain-community/package.json index a1f60050f981..e8746e15b833 100644 --- a/libs/langchain-community/package.json +++ 
b/libs/langchain-community/package.json @@ -2085,6 +2085,15 @@ "import": "./retrievers/amazon_knowledge_base.js", "require": "./retrievers/amazon_knowledge_base.cjs" }, + "./retrievers/arxiv": { + "types": { + "import": "./retrievers/arxiv.d.ts", + "require": "./retrievers/arxiv.d.cts", + "default": "./retrievers/arxiv.d.ts" + }, + "import": "./retrievers/arxiv.js", + "require": "./retrievers/arxiv.cjs" + }, "./retrievers/bm25": { "types": { "import": "./retrievers/bm25.d.ts", @@ -3673,6 +3682,10 @@ "retrievers/amazon_knowledge_base.js", "retrievers/amazon_knowledge_base.d.ts", "retrievers/amazon_knowledge_base.d.cts", + "retrievers/arxiv.cjs", + "retrievers/arxiv.js", + "retrievers/arxiv.d.ts", + "retrievers/arxiv.d.cts", "retrievers/bm25.cjs", "retrievers/bm25.js", "retrievers/bm25.d.ts", diff --git a/libs/langchain-community/src/load/import_map.ts b/libs/langchain-community/src/load/import_map.ts index 5bbd9e4d0a01..76870173d7da 100644 --- a/libs/langchain-community/src/load/import_map.ts +++ b/libs/langchain-community/src/load/import_map.ts @@ -54,6 +54,7 @@ export * as chat_models__moonshot from "../chat_models/moonshot.js"; export * as chat_models__ollama from "../chat_models/ollama.js"; export * as chat_models__togetherai from "../chat_models/togetherai.js"; export * as chat_models__yandex from "../chat_models/yandex.js"; +export * as retrievers__arxiv from "../retrievers/arxiv.js"; export * as retrievers__bm25 from "../retrievers/bm25.js"; export * as retrievers__chaindesk from "../retrievers/chaindesk.js"; export * as retrievers__databerry from "../retrievers/databerry.js"; diff --git a/libs/langchain-community/src/retrievers/arxiv.ts b/libs/langchain-community/src/retrievers/arxiv.ts new file mode 100644 index 000000000000..b8bb5a524eea --- /dev/null +++ b/libs/langchain-community/src/retrievers/arxiv.ts @@ -0,0 +1,45 @@ +import { BaseRetriever, BaseRetrieverInput } from "@langchain/core/retrievers"; +import { Document } from "@langchain/core/documents"; 
+import { searchArxiv, loadDocsFromResults, getDocsFromSummaries } from '../utils/arxiv.js'; + +export type ArxivRetrieverOptions = { + getFullDocuments?: boolean; + maxSearchResults?: number; +} & BaseRetrieverInput; + +/** + * A retriever that searches arXiv for relevant articles based on a query. + * It can retrieve either full documents (PDFs) or just summaries. + */ +export class ArxivRetriever extends BaseRetriever { + static lc_name() { + return "ArxivRetriever"; + } + + lc_namespace = ["langchain", "retrievers", "arxiv_retriever"]; + + getFullDocuments: boolean; + maxSearchResults: number; + + constructor(options: ArxivRetrieverOptions = {}) { + super(options); + this.getFullDocuments = options.getFullDocuments ?? false; + this.maxSearchResults = options.maxSearchResults ?? 10; + } + + async _getRelevantDocuments(query: string): Promise { + try { + const results = await searchArxiv(query, this.maxSearchResults); + + if (this.getFullDocuments) { + // Fetch and parse PDFs to get full documents + return await loadDocsFromResults(results); + } else { + // Use summaries as documents + return getDocsFromSummaries(results); + } + } catch (error) { + throw new Error(`Error retrieving documents from arXiv.`); + } + } +} diff --git a/libs/langchain-community/src/utils/arxiv.ts b/libs/langchain-community/src/utils/arxiv.ts new file mode 100644 index 000000000000..e62e228876e2 --- /dev/null +++ b/libs/langchain-community/src/utils/arxiv.ts @@ -0,0 +1,201 @@ +import axios from 'axios'; // For HTTP requests +import pdfParse from 'pdf-parse'; // For parsing PDFs +import { XMLParser } from 'fast-xml-parser'; // For parsing XML +import { Document } from "@langchain/core/documents"; + +// Interface for processed arXiv entry +interface ArxivEntry { + id: string; + title: string; + summary: string; + published: string; + updated: string; + authors: string[]; + pdfUrl: string; + links: any[]; +} + +// Used to check if the query is an arXiv ID, or a natural language query 
// Base URL of the arXiv Atom query API (HTTPS endpoint).
const ARXIV_API_URL = "https://export.arxiv.org/api/query";

// A single arXiv identifier, optionally followed by a `vN` version suffix:
//   - new style:               2103.03404, 0704.0001
//   - bare old-style number:   9901001
//   - old style with archive:  hep-th/9901001, math.GT/0309136
const ARXIV_ID_PATTERN =
  /^(?:\d{4}\.\d{4,5}|\d{7}(?:\.\d+)?|[a-z]+(?:-[a-z]+)?(?:\.[A-Z]{2})?\/\d{7})(?:v\d+)?$/;

// Splits a whitespace- and/or comma-separated ID string into its non-empty tokens.
function splitArxivIds(raw: string): string[] {
  return raw.split(/[\s,]+/).filter(Boolean);
}

// Extracts a printable message from whatever value was thrown.
function describeArxivError(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

// Parses an arXiv Atom feed and converts every `<entry>` with `processEntry`.
function parseArxivFeed(xml: string): ArxivEntry[] {
  const parser = new XMLParser({
    ignoreAttributes: false,
    attributeNamePrefix: "@_",
  });
  // `feed` may be missing when the response is not an Atom document.
  const entries = parser.parse(xml)?.feed?.entry;
  if (!entries) {
    return [];
  }
  // The parser yields a bare object (not an array) when there is one entry.
  const entryList = Array.isArray(entries) ? entries : [entries];
  return entryList.map(processEntry);
}

// Performs the GET request against the arXiv API and parses the returned feed.
async function requestArxivEntries(url: string): Promise<ArxivEntry[]> {
  const response = await axios.get(url);
  return parseArxivFeed(response.data);
}

/**
 * Tells whether a query should be treated as arXiv identifier(s) rather than
 * as a natural-language search.
 *
 * The query may hold one identifier or several separated by whitespace and/or
 * commas (the same separators `fetchDirectArxivArticle` accepts); it counts as
 * an identifier query only when every token is a well-formed arXiv ID.
 *
 * @param query - Raw user query.
 * @returns `true` when the query consists solely of arXiv identifiers,
 *   `false` otherwise (including for an empty or blank query).
 */
export function isArXivIdentifier(query: string): boolean {
  const ids = splitArxivIds(query);
  return ids.length > 0 && ids.every((id) => ARXIV_ID_PATTERN.test(id));
}

/**
 * Fetches arXiv articles directly by identifier.
 *
 * @param arxivIds - One or more arXiv IDs separated by whitespace and/or commas.
 * @returns The processed entries found in the returned feed; an empty array
 *   when no ID was supplied or the feed contains no entries.
 * @throws Error when the request or the feed parsing fails; the message
 *   includes the underlying failure reason.
 */
export async function fetchDirectArxivArticle(
  arxivIds: string
): Promise<ArxivEntry[]> {
  const ids = splitArxivIds(arxivIds);
  if (ids.length === 0) {
    // Nothing to look up: skip the network round trip.
    return [];
  }

  try {
    // Encode each ID on its own so old-style IDs containing "/" stay valid,
    // while the "," separators are sent verbatim.
    const idList = ids.map((id) => encodeURIComponent(id)).join(",");
    return await requestArxivEntries(`${ARXIV_API_URL}?id_list=${idList}`);
  } catch (error) {
    throw new Error(
      `Failed to fetch articles with IDs ${arxivIds}: ${describeArxivError(error)}`
    );
  }
}

/**
 * Fetches arXiv articles matching a natural-language query (searched across
 * all fields via the `all:` prefix).
 *
 * @param query - Search text; a blank query yields no results.
 * @param start - Zero-based offset of the first result (default `0`).
 * @param maxResults - Maximum number of results to request (default `10`);
 *   `0` yields no results.
 * @returns The processed entries found in the returned feed.
 * @throws Error when `start` or `maxResults` is not a non-negative integer, or
 *   when the request or the feed parsing fails.
 */
export async function fetchArxivResultsByQuery(
  query: string,
  start = 0,
  maxResults = 10
): Promise<ArxivEntry[]> {
  const pagingIsValid =
    Number.isInteger(start) &&
    start >= 0 &&
    Number.isInteger(maxResults) &&
    maxResults >= 0;
  if (!pagingIsValid) {
    throw new Error(
      `Failed to fetch articles with query "${query}": start and maxResults must be non-negative integers`
    );
  }

  const searchText = query.trim();
  if (searchText === "" || maxResults === 0) {
    // Nothing can be returned: skip the network round trip.
    return [];
  }

  try {
    const encodedQuery = encodeURIComponent(searchText);
    const url = `${ARXIV_API_URL}?search_query=all:${encodedQuery}&start=${start}&max_results=${maxResults}`;
    return await requestArxivEntries(url);
  } catch (error) {
    throw new Error(
      `Failed to fetch articles with query "${query}": ${describeArxivError(error)}`
    );
  }
}

// Used to search for arXiv
articles with a maxResults parameter +export async function searchArxiv(query: string, maxResults = 3): Promise { + if (isArXivIdentifier(query)) { + return await fetchDirectArxivArticle(query); + } else { + return await fetchArxivResultsByQuery(query, 0, maxResults); + } +} + +// Used to fetch and parse PDF to text +export async function fetchAndParsePDF(pdfUrl: string): Promise { + try { + const response = await axios.get(pdfUrl, { responseType: 'arraybuffer' }); + const buffer = Buffer.from(response.data); + const data = await pdfParse(buffer); + return data.text; + } catch (error) { + throw new Error(`Failed to fetch or parse PDF from ${pdfUrl}`); + } +} + +// Used to load raw text from each search result, and convert to Document instances +export async function loadDocsFromResults(results: ArxivEntry[]): Promise { + const docs: Document[] = []; + for (const result of results) { + const pdfUrl = result.pdfUrl; + try { + const pdfContent = await fetchAndParsePDF(pdfUrl); + const metadata = { + id: result.id, + title: result.title, + authors: result.authors, + published: result.published, + updated: result.updated, + source: 'arxiv', + url: result.id, + summary: result.summary, + }; + const doc = new Document({ + pageContent: pdfContent, + metadata, + }); + docs.push(doc); + } catch (error) { + throw new Error(`Error loading document from ${pdfUrl}`); + } + } + return docs; +} + +// Used to convert metadata and summaries to Document instances +export function getDocsFromSummaries(results: ArxivEntry[]): Document[] { + const docs: Document[] = []; + for (const result of results) { + const metadata = { + id: result.id, + title: result.title, + authors: result.authors, + published: result.published, + updated: result.updated, + source: 'arxiv', + url: result.id, + }; + const doc = new Document({ + pageContent: result.summary, + metadata, + }); + docs.push(doc); + } + return docs; +} + +// Helper function to process each arXiv entry +function processEntry(entry: 
any): ArxivEntry { + const id = entry.id; + const title = entry.title.replace(/\s+/g, ' ').trim(); + const summary = entry.summary.replace(/\s+/g, ' ').trim(); + const published = entry.published; + const updated = entry.updated; + + // Extract authors + let authors: string[] = []; + if (Array.isArray(entry.author)) { + authors = entry.author.map((author: any) => author.name); + } else if (entry.author) { + authors = [entry.author.name]; + } + + // Extract links + let links: any[] = []; + if (Array.isArray(entry.link)) { + links = entry.link; + } else if (entry.link) { + links = [entry.link]; + } + + // Extract PDF link + let pdfUrl = id.replace('/abs/', '/pdf/') + '.pdf'; + const pdfLinkObj = links.find((link: any) => link["@_title"] === 'pdf'); + if (pdfLinkObj && pdfLinkObj["@_href"]) { + pdfUrl = pdfLinkObj["@_href"]; + } + + return { + id, + title, + summary, + published, + updated, + authors, + pdfUrl, + links, + }; +} From b4d4a699c645f2aaf4cc8e8360ba8f6f43151160 Mon Sep 17 00:00:00 2001 From: Dhruvin Patel Date: Mon, 18 Nov 2024 13:24:50 -0500 Subject: [PATCH 02/16] Documentation for Arxiv-Retriever --- .../retrievers/arxiv-retriever.mdx | 148 ++++++++++++++++++ 1 file changed, 148 insertions(+) create mode 100644 docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx diff --git a/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx b/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx new file mode 100644 index 000000000000..23a67e7a98df --- /dev/null +++ b/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx @@ -0,0 +1,148 @@ +# Documentation for ArxivRetriever in LangChain.js +--- + +## Overview + +The `arXiv Retriever` allows users to query the arXiv database for academic articles. It supports both full-document retrieval (PDF parsing) and summary-based retrieval. 
+ +--- + +## Installation + +Ensure the following dependencies are installed: +- `axios` for making HTTP requests +- `pdf-parse` for parsing PDFs +- `fast-xml-parser` for parsing XML responses from the arXiv API + +```bash +npm install axios pdf-parse fast-xml-parser +``` +--- +## Features +- Query Flexibility: Search using natural language queries or specific arXiv IDs. +- Full-Document Retrieval: Option to fetch and parse PDFs. +- Summaries as Documents: Retrieve summaries for faster results. +- Customizable Options: Configure maximum results and output format. + +--- + +## Getting started + +#### Import the path +```bash +import { ArxivRetriever } from "langchain-community/retrievers/arxiv.js"; +``` +--- + +## Class: ArxivRetriever + +### Parameters + +| Name | Type | Default | Description | +|-------------------|-----------|---------|------------------------------------------------------| +| `getFullDocuments` | `boolean` | `false` | Whether to fetch full documents (PDFs) instead of summaries. | +| `maxSearchResults` | `number` | `10` | Maximum number of results to fetch from arXiv. | + + + +### Methods + +### `_getRelevantDocuments(query: string): Promise` + +Fetches documents from arXiv based on the input query. + +#### Parameters + +| Name | Type | Description | +|--------|----------|----------------------------------------| +| `query` | `string` | A natural language query or arXiv ID. | + +#### Returns +A `Promise` that resolves to an array of LangChain `Document` instances. + +#### Example +```typescript +const documents = await retriever._getRelevantDocuments("machine learning in climate science"); +console.log(documents); +``` +--- + +## Utility Functions + +## `isArXivIdentifier(query: string): boolean` + +Checks if a query is a valid arXiv ID. + +### Parameters + +| Name | Type | Description | +|--------|----------|-----------------------------------| +| `query` | `string` | Query to check for arXiv ID format. 
| + +### Returns + +`true` if the query is a valid arXiv ID; otherwise, `false`. + + +## `fetchDirectArxivArticle(arxivIds: string): Promise` + +Fetches arXiv articles using specific arXiv IDs. + +### Parameters + +| Name | Type | Description | +|------------|----------|---------------------------------------| +| `arxivIds` | `string` | Comma-separated list of arXiv IDs. | + +### Returns + +A `Promise` that resolves to an array of `ArxivEntry` objects. + + +## `fetchArxivResultsByQuery(query: string, maxResults: number): Promise` + +Fetches results from arXiv using a natural language query. + +### Parameters + +| Name | Type | Default | Description | +|--------------|----------|---------|--------------------------------------| +| `query` | `string` | | Search query. | +| `maxResults` | `number` | `10` | Maximum number of results to fetch. | + +### Returns + +A `Promise` that resolves to an array of `ArxivEntry` objects. + + +## `fetchAndParsePDF(pdfUrl: string): Promise` + +Fetches a PDF document and parses its content into text. + +### Parameters + +| Name | Type | Description | +|----------|----------|-----------------------------| +| `pdfUrl` | `string` | URL of the PDF to retrieve. | + +### Returns + +A `Promise` that resolves to the parsed text of the PDF. 
+ +--- + +## Example +```bash +import { ArxivRetriever } from "langchain-community/retrievers/arxiv.js"; + +const retriever = new ArxivRetriever({ + getFullDocuments: false, + maxSearchResults: 3, +}); + +const documents = await retriever._getRelevantDocuments("neural networks in optimization"); +documents.forEach(doc => { + console.log("Title:", doc.metadata.title); + console.log("Summary:", doc.pageContent); +}); +``` From 5b8958f4d2c749ca6217a92b24b2f48c736f2f89 Mon Sep 17 00:00:00 2001 From: Dhruvin Patel Date: Tue, 19 Nov 2024 18:57:41 -0500 Subject: [PATCH 03/16] Edit the documentation for arXIV --- .../retrievers/arxiv-retriever.mdx | 116 ++++-------------- 1 file changed, 23 insertions(+), 93 deletions(-) diff --git a/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx b/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx index 23a67e7a98df..5009109e3cea 100644 --- a/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx +++ b/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx @@ -1,4 +1,4 @@ -# Documentation for ArxivRetriever in LangChain.js +# ArxivRetriever in LangChain.js (Docs) --- ## Overview @@ -7,6 +7,13 @@ The `arXiv Retriever` allows users to query the arXiv database for academic arti --- +## Features +- Query Flexibility: Search using natural language queries or specific arXiv IDs. +- Full-Document Retrieval: Option to fetch and parse PDFs. +- Summaries as Documents: Retrieve summaries for faster results. +- Customizable Options: Configure maximum results and output format. + +--- ## Installation Ensure the following dependencies are installed: @@ -17,21 +24,22 @@ Ensure the following dependencies are installed: ```bash npm install axios pdf-parse fast-xml-parser ``` ---- -## Features -- Query Flexibility: Search using natural language queries or specific arXiv IDs. -- Full-Document Retrieval: Option to fetch and parse PDFs. -- Summaries as Documents: Retrieve summaries for faster results. 
-- Customizable Options: Configure maximum results and output format. - --- ## Getting started #### Import the path -```bash +```typescript import { ArxivRetriever } from "langchain-community/retrievers/arxiv.js"; ``` + +#### Instantiate the retriever +```typescript +const retriever = new ArxivRetriever({ + getFullDocuments: false, // Set to true to fetch full documents (PDFs) + maxSearchResults: 5, // Maximum number of results to retrieve +}); +``` --- ## Class: ArxivRetriever @@ -47,9 +55,9 @@ import { ArxivRetriever } from "langchain-community/retrievers/arxiv.js"; ### Methods -### `_getRelevantDocuments(query: string): Promise` +### `invoke(query: string): Promise` -Fetches documents from arXiv based on the input query. +Use the invoke method to search arXiv for relevant articles. You can use either natural language queries or specific arXiv IDs. #### Parameters @@ -62,87 +70,9 @@ A `Promise` that resolves to an array of LangChain `Document` instances. #### Example ```typescript -const documents = await retriever._getRelevantDocuments("machine learning in climate science"); -console.log(documents); -``` ---- - -## Utility Functions - -## `isArXivIdentifier(query: string): boolean` - -Checks if a query is a valid arXiv ID. - -### Parameters - -| Name | Type | Description | -|--------|----------|-----------------------------------| -| `query` | `string` | Query to check for arXiv ID format. | - -### Returns - -`true` if the query is a valid arXiv ID; otherwise, `false`. - - -## `fetchDirectArxivArticle(arxivIds: string): Promise` - -Fetches arXiv articles using specific arXiv IDs. - -### Parameters - -| Name | Type | Description | -|------------|----------|---------------------------------------| -| `arxivIds` | `string` | Comma-separated list of arXiv IDs. | - -### Returns - -A `Promise` that resolves to an array of `ArxivEntry` objects. 
- - -## `fetchArxivResultsByQuery(query: string, maxResults: number): Promise` - -Fetches results from arXiv using a natural language query. - -### Parameters - -| Name | Type | Default | Description | -|--------------|----------|---------|--------------------------------------| -| `query` | `string` | | Search query. | -| `maxResults` | `number` | `10` | Maximum number of results to fetch. | - -### Returns - -A `Promise` that resolves to an array of `ArxivEntry` objects. - - -## `fetchAndParsePDF(pdfUrl: string): Promise` - -Fetches a PDF document and parses its content into text. - -### Parameters - -| Name | Type | Description | -|----------|----------|-----------------------------| -| `pdfUrl` | `string` | URL of the PDF to retrieve. | - -### Returns - -A `Promise` that resolves to the parsed text of the PDF. - ---- - -## Example -```bash -import { ArxivRetriever } from "langchain-community/retrievers/arxiv.js"; - -const retriever = new ArxivRetriever({ - getFullDocuments: false, - maxSearchResults: 3, -}); - -const documents = await retriever._getRelevantDocuments("neural networks in optimization"); +const documents = await retriever.invoke("quantum computing"); documents.forEach(doc => { - console.log("Title:", doc.metadata.title); - console.log("Summary:", doc.pageContent); + console.log("Title:", doc.metadata.title); + console.log("Content:", doc.pageContent); // Parsed PDF content }); -``` +``` \ No newline at end of file From 47dcac0a4b80a1d8fe96c323cd1a719e46d02ac7 Mon Sep 17 00:00:00 2001 From: Yiran Gogo Yu Date: Tue, 19 Nov 2024 23:17:23 -0500 Subject: [PATCH 04/16] Create integration test for Arxiv-Retriever --- .../src/retrievers/tests/arxiv.int.test.ts | 42 +++++++++++++++++++ libs/langchain-community/src/utils/arxiv.ts | 20 ++++++--- 2 files changed, 57 insertions(+), 5 deletions(-) create mode 100644 libs/langchain-community/src/retrievers/tests/arxiv.int.test.ts diff --git a/libs/langchain-community/src/retrievers/tests/arxiv.int.test.ts 
b/libs/langchain-community/src/retrievers/tests/arxiv.int.test.ts new file mode 100644 index 000000000000..071bce9d91aa --- /dev/null +++ b/libs/langchain-community/src/retrievers/tests/arxiv.int.test.ts @@ -0,0 +1,42 @@ +import { test, expect } from "@jest/globals"; +import { ArxivRetriever } from "../arxiv.js"; + +test("ArxivRetriever integration test", async () => { + // Sample integration test for ArxivRetriever using the "machine learning" query + const retriever = new ArxivRetriever( + { + getFullDocuments: false, + maxSearchResults: 5 + } + ); + const query = "machine learning"; + const results = await retriever._getRelevantDocuments(query); + + expect(results).toBeDefined(); + expect(results.length).toBeGreaterThan(0); + expect(results.length).toBeLessThanOrEqual(5); + + for (let i = 0; i < results.length; i += 1) { + expect(results[i]).toHaveProperty("pageContent"); + expect(results[i].pageContent).toBeDefined(); + + expect(results[i]).toHaveProperty("id"); + expect(results[i].id).toBeUndefined(); + + expect(results[i]).toHaveProperty("metadata"); + expect(results[i].metadata).toBeInstanceOf(Object); + expect(results[i].metadata).toHaveProperty("authors"); + expect(results[i].metadata.authors).toBeInstanceOf(Array); + expect(results[i].metadata).toHaveProperty("id"); + expect(results[i].metadata.id).toContain("arxiv.org"); + expect(results[i].metadata).toHaveProperty("published"); + expect(results[i].metadata.published).toMatch(/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/); + expect(results[i].metadata).toHaveProperty("source"); + expect(results[i].metadata.source).toBe("arxiv"); + expect(results[i].metadata).toHaveProperty("title"); + expect(results[i].metadata).toHaveProperty("updated"); + expect(results[i].metadata.updated).toMatch(/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/); + expect(results[i].metadata).toHaveProperty("url"); + expect(results[i].metadata.url).toContain("arxiv.org"); + } +}); \ No newline at end of file diff --git 
a/libs/langchain-community/src/utils/arxiv.ts b/libs/langchain-community/src/utils/arxiv.ts index e62e228876e2..74439e45c69a 100644 --- a/libs/langchain-community/src/utils/arxiv.ts +++ b/libs/langchain-community/src/utils/arxiv.ts @@ -1,5 +1,5 @@ import axios from 'axios'; // For HTTP requests -import pdfParse from 'pdf-parse'; // For parsing PDFs +import { PDFLoader } from "../document_loaders/fs/pdf.js"; import { XMLParser } from 'fast-xml-parser'; // For parsing XML import { Document } from "@langchain/core/documents"; @@ -97,10 +97,20 @@ export async function searchArxiv(query: string, maxResults = 3): Promise { try { - const response = await axios.get(pdfUrl, { responseType: 'arraybuffer' }); + // Fetch the PDF as an array buffer + const response = await axios.get(pdfUrl, { responseType: "arraybuffer" }); const buffer = Buffer.from(response.data); - const data = await pdfParse(buffer); - return data.text; + + // Convert the Buffer to a Blob + const blob = new Blob([buffer], { type: "application/pdf" }); + + // Use PDFLoader to process the PDF + const loader = new PDFLoader(blob, { splitPages: false }); // Pass the Blob + const docs: Document[] = await loader.load(); + + // Combine all document content into a single string + const content = docs.map((doc) => doc.pageContent).join("\n\n"); + return content; } catch (error) { throw new Error(`Failed to fetch or parse PDF from ${pdfUrl}`); } @@ -198,4 +208,4 @@ function processEntry(entry: any): ArxivEntry { pdfUrl, links, }; -} +} \ No newline at end of file From f00deda9b14bf5085572eceb527fba565c2393d2 Mon Sep 17 00:00:00 2001 From: Yiran Gogo Yu Date: Wed, 20 Nov 2024 20:45:55 -0500 Subject: [PATCH 05/16] Update integration test for arxiv retriever --- .../src/retrievers/tests/arxiv.int.test.ts | 300 +++++++++++++++++- 1 file changed, 298 insertions(+), 2 deletions(-) diff --git a/libs/langchain-community/src/retrievers/tests/arxiv.int.test.ts b/libs/langchain-community/src/retrievers/tests/arxiv.int.test.ts 
index 071bce9d91aa..11eb1040ed18 100644 --- a/libs/langchain-community/src/retrievers/tests/arxiv.int.test.ts +++ b/libs/langchain-community/src/retrievers/tests/arxiv.int.test.ts @@ -1,7 +1,7 @@ import { test, expect } from "@jest/globals"; import { ArxivRetriever } from "../arxiv.js"; -test("ArxivRetriever integration test", async () => { +test("ArxivRetriever fetching document summaries test", async () => { // Sample integration test for ArxivRetriever using the "machine learning" query const retriever = new ArxivRetriever( { @@ -16,12 +16,116 @@ test("ArxivRetriever integration test", async () => { expect(results.length).toBeGreaterThan(0); expect(results.length).toBeLessThanOrEqual(5); + for (let i = 0; i < results.length; i += 1) { + expect(results[i]).toHaveProperty("pageContent"); + expect(results[i].pageContent).toBeDefined(); + + expect(results[i]).toHaveProperty("metadata"); + expect(results[i].metadata).toBeInstanceOf(Object); + expect(results[i].metadata).toHaveProperty("authors"); + expect(results[i].metadata.authors).toBeInstanceOf(Array); + expect(results[i].metadata).toHaveProperty("id"); + expect(results[i].metadata.id).toContain("arxiv.org"); + expect(results[i].metadata).toHaveProperty("published"); + expect(results[i].metadata.published).toMatch(/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/); + expect(results[i].metadata).toHaveProperty("source"); + expect(results[i].metadata.source).toBe("arxiv"); + expect(results[i].metadata).toHaveProperty("title"); + expect(results[i].metadata).toHaveProperty("updated"); + expect(results[i].metadata.updated).toMatch(/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/); + expect(results[i].metadata).toHaveProperty("url"); + expect(results[i].metadata.url).toContain("arxiv.org"); + } +}); + +test("ArxivRetriever fetching document summaries with invalid query test", async () => { + // Sample test for ArxivRetriever using an invalid query + const retriever = new ArxivRetriever( + { + getFullDocuments: false, + 
maxSearchResults: 5 + } + ); + const query = "fjalsdkjfw"; + const results = await retriever._getRelevantDocuments(query); + + expect(results).toBeDefined(); + expect(results.length).toBe(0); +}); + +test("ArxivRetriever fetching document summaries with empty query test", async () => { + // Sample test for ArxivRetriever using an empty query + const retriever = new ArxivRetriever( + { + getFullDocuments: false, + maxSearchResults: 5 + } + ); + const query = ""; + const results = await retriever._getRelevantDocuments(query); + + expect(results).toBeDefined(); + expect(results.length).toBe(0); +}); + +test("ArxivRetriever fetching document summaries with invalid maxSearchResults test", async () => { + // Sample test for ArxivRetriever using an invalid maxSearchResults + try { + const retriever = new ArxivRetriever( + { + getFullDocuments: true, + maxSearchResults: -1 + } + ); + const query = "machine learning"; + const results = await retriever._getRelevantDocuments(query); + expect(results).toBeUndefined(); + expect(results.length).toBe(0); + } catch (error) { + expect(error).toBeDefined(); + expect(error).toBeInstanceOf(Error); + } +}); + +test("ArxivRetriever fetching document summaries with zero maxSearchResults test", async () => { + // Sample test for ArxivRetriever using an zero maxSearchResults + try { + const retriever = new ArxivRetriever( + { + getFullDocuments: true, + maxSearchResults: 0 + } + ); + const query = "machine learning"; + const results = await retriever._getRelevantDocuments(query); + expect(results).toBeUndefined(); + expect(results.length).toBe(0); + } catch (error) { + expect(error).toBeDefined(); + expect(error).toBeInstanceOf(Error); + } +}); + +test("ArxivRetriever fetching full documents test", async () => { + // Sample test for fetching full documents with ArxivRetriever + const retriever = new ArxivRetriever( + { + getFullDocuments: true, + maxSearchResults: 5 + } + ); + const query = "machine learning"; + const results = await 
retriever._getRelevantDocuments(query); + + expect(results).toBeDefined(); + expect(results.length).toBeGreaterThan(0); + expect(results.length).toBeLessThanOrEqual(5); + for (let i = 0; i < results.length; i += 1) { expect(results[i]).toHaveProperty("pageContent"); expect(results[i].pageContent).toBeDefined(); expect(results[i]).toHaveProperty("id"); - expect(results[i].id).toBeUndefined(); expect(results[i]).toHaveProperty("metadata"); expect(results[i].metadata).toBeInstanceOf(Object); @@ -38,5 +142,197 @@ test("ArxivRetriever integration test", async () => { expect(results[i].metadata.updated).toMatch(/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/); expect(results[i].metadata).toHaveProperty("url"); expect(results[i].metadata.url).toContain("arxiv.org"); + expect(results[i].metadata).toHaveProperty("summary"); + } +}); + +test("ArxivRetriever fetching full documents with invalid query test", async () => { + // Sample test for fetching full documents with ArxivRetriever using an invalid query + const retriever = new ArxivRetriever( + { + getFullDocuments: true, + maxSearchResults: 5 + } + ); + const query = "fjalsdkjfw"; + const results = await retriever._getRelevantDocuments(query); + + expect(results).toBeDefined(); + expect(results.length).toBe(0); +}); + +test("ArxivRetriever fetching full documents with empty query test", async () => { + // Sample test for fetching full documents with ArxivRetriever using an empty query + const retriever = new ArxivRetriever( + { + getFullDocuments: true, + maxSearchResults: 5 + } + ); + const query = ""; + const results = await retriever._getRelevantDocuments(query); + + expect(results).toBeDefined(); + expect(results.length).toBe(0); +}); + +test("ArxivRetriever fetching full documents with invalid maxSearchResults test", async () => { + // Sample test for fetching full documents with ArxivRetriever using an invalid maxSearchResults + try { + const retriever = new ArxivRetriever( + { + getFullDocuments: true, + 
maxSearchResults: -1 + } + ); + const query = "machine learning"; + const results = await retriever._getRelevantDocuments(query); + expect(results).toBeUndefined(); + expect(results.length).toBe(0); + } catch (error) { + expect(error).toBeDefined(); + expect(error).toBeInstanceOf(Error); + } +}); + +test("ArxivRetriever fetching full documents with zero maxSearchResults", async () => { + // Sample test for fetching full documents with ArxivRetriever using an zero maxSearchResults + try { + const retriever = new ArxivRetriever( + { + getFullDocuments: true, + maxSearchResults: 0 + } + ); + const query = "machine learning"; + const results = await retriever._getRelevantDocuments(query); + expect(results).toBeUndefined(); + expect(results.length).toBe(0); + } catch (error) { + expect(error).toBeDefined(); + expect(error).toBeInstanceOf(Error); + } +}); + +test("ArxivRetriever search articles by id test", async () => { + // Sample test for fetching articles by arXiv IDs + const fetchIds = "2103.03404 2103.03405"; + const retriever = new ArxivRetriever( + { + getFullDocuments: false, + maxSearchResults: 5 + } + ); + const results = await retriever.invoke(fetchIds); + + expect(results).toBeDefined(); + expect(results.length).toBe(2); + + for (let i = 0; i < results.length; i += 1) { + expect(results[i]).toHaveProperty("pageContent"); + expect(results[i].pageContent).toBeDefined(); + + expect(results[i]).toHaveProperty("metadata"); + expect(results[i].metadata).toBeInstanceOf(Object); + expect(results[i].metadata).toHaveProperty("authors"); + expect(results[i].metadata.authors).toBeInstanceOf(Array); + expect(results[i].metadata).toHaveProperty("id"); + expect(results[i].metadata.id).toContain("arxiv.org"); + expect(results[i].metadata).toHaveProperty("published"); + expect(results[i].metadata.published).toMatch(/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/); + expect(results[i].metadata).toHaveProperty("source"); + expect(results[i].metadata.source).toBe("arxiv"); + 
expect(results[i].metadata).toHaveProperty("title"); + expect(results[i].metadata).toHaveProperty("updated"); + expect(results[i].metadata.updated).toMatch(/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/); + expect(results[i].metadata).toHaveProperty("url"); + expect(results[i].metadata.url).toContain("arxiv.org"); + } +}); + +test("ArxivRetriever search articles by id with invalid id test", async () => { + // Sample test for fetching articles by arXiv IDs with an invalid ID + const fetchIds = "2103.03404 2103.03405 1234.56789"; + const retriever = new ArxivRetriever( + { + getFullDocuments: false, + maxSearchResults: 5 + } + ); + const results = await retriever.invoke(fetchIds); + + expect(results).toBeDefined(); + expect(results.length).toBeLessThan(3); +}); + +test("ArxivRetriever search articles by id with empty id test", async () => { + // Sample test for fetching articles by arXiv IDs with an empty ID + const fetchIds = ""; + const retriever = new ArxivRetriever( + { + getFullDocuments: false, + maxSearchResults: 5 + } + ); + const results = await retriever.invoke(fetchIds); + + expect(results).toBeDefined(); + expect(results.length).toBe(0); +}); + +test("ArxivRetriever search articles by id with invalid maxSearchResults test", async () => { + // Sample test for fetching articles by arXiv IDs with an invalid maxSearchResults + try { + const fetchIds = "2103.03404 2103.03405"; + const retriever = new ArxivRetriever( + { + getFullDocuments: false, + maxSearchResults: -1 + } + ); + const results = await retriever.invoke(fetchIds); + expect(results).toBeUndefined(); + expect(results.length).toBe(0); + } catch (error) { + expect(error).toBeDefined(); + expect(error).toBeInstanceOf(Error); + } +}); + +test("ArxivRetriever search articles by id with invalid id and maxSearchResults test", async () => { + // Sample test for fetching articles by arXiv IDs with an invalid ID and maxSearchResults + try { + const fetchIds = "2103.03404 2103.03405 1234.56789"; + const retriever 
= new ArxivRetriever( + { + getFullDocuments: false, + maxSearchResults: -1 + } + ); + const results = await retriever.invoke(fetchIds); + expect(results).toBeUndefined(); + expect(results.length).toBe(0); + } catch (error) { + expect(error).toBeDefined(); + expect(error).toBeInstanceOf(Error); + } +}); + +test("ArxivRetriever search articles by id with invalid id and zero maxSearchResults test", async () => { + // Sample test for fetching articles by arXiv IDs with an invalid ID and zero maxSearchResults + try { + const fetchIds = "2103.03404 2103.03405 1234.56789"; + const retriever = new ArxivRetriever( + { + getFullDocuments: false, + maxSearchResults: 0 + } + ); + const results = await retriever.invoke(fetchIds); + expect(results).toBeUndefined(); + expect(results.length).toBe(0); + } catch (error) { + expect(error).toBeDefined(); + expect(error).toBeInstanceOf(Error); } }); \ No newline at end of file From e52a6e12ff4efb253e6e8e7b3d4da3efaace8be9 Mon Sep 17 00:00:00 2001 From: boni-teppanyaki Date: Fri, 22 Nov 2024 19:45:50 -0500 Subject: [PATCH 06/16] Add example usage file for arxiv retriever --- examples/src/retrievers/arxiv.ts | 67 ++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 examples/src/retrievers/arxiv.ts diff --git a/examples/src/retrievers/arxiv.ts b/examples/src/retrievers/arxiv.ts new file mode 100644 index 000000000000..3c164844474e --- /dev/null +++ b/examples/src/retrievers/arxiv.ts @@ -0,0 +1,67 @@ +import { ArxivRetriever } from "../../../libs/langchain-community/src/retrievers/arxiv.js"; + +export const run = async () => { + /* + Direct look up by arXiv ID, for full texts + */ + + const queryId = "1605.08386 2103.03404"; + const retrieverById = new ArxivRetriever({ + getFullDocuments: true, + maxSearchResults: 5 + }); + const documentsById = await retrieverById.invoke(queryId); + console.log(documentsById); + + /* + [ + Document + { + pageContent, + metadata: + { + author, + id, + published, + source, + 
updated, + url + } + }, + Document + { + pageContent, + metadata + } + ] + */ + + /* + Search with natural language query, for summaries + */ + + const queryNat = "What is the ImageBind model?"; + const retrieverByNat = new ArxivRetriever( + { + getFullDocuments: false, + maxSearchResults: 2 + } + ); + const documentsByQuery = await retrieverByNat.invoke(queryNat); + console.log(documentsByQuery); + + /* + [ + Document + { + pageContent, + metadata + }, + Document + { + pageContent, + metadata + } + ] + */ +}; \ No newline at end of file From caa109c498c4b927ae160fee8a910aa9687352ad Mon Sep 17 00:00:00 2001 From: Yiran Gogo Yu Date: Thu, 12 Dec 2024 16:54:24 -0500 Subject: [PATCH 07/16] Updated file to use fetch() instead of axios.get() 1. Removed the import axios line in src/utils/arxiv.ts 2. Removed the dependencies to use axios in "ArxivRetriever in LangChain.js (Docs)" file --- .../retrievers/arxiv-retriever.mdx | 3 +- libs/langchain-community/src/utils/arxiv.ts | 32 +++++++++++++------ 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx b/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx index 5009109e3cea..f82b5a76237e 100644 --- a/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx +++ b/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx @@ -17,12 +17,11 @@ The `arXiv Retriever` allows users to query the arXiv database for academic arti ## Installation Ensure the following dependencies are installed: -- `axios` for making HTTP requests - `pdf-parse` for parsing PDFs - `fast-xml-parser` for parsing XML responses from the arXiv API ```bash -npm install axios pdf-parse fast-xml-parser +npm install pdf-parse fast-xml-parser ``` --- diff --git a/libs/langchain-community/src/utils/arxiv.ts b/libs/langchain-community/src/utils/arxiv.ts index 74439e45c69a..c4ee12146bcf 100644 --- a/libs/langchain-community/src/utils/arxiv.ts +++ 
b/libs/langchain-community/src/utils/arxiv.ts @@ -1,4 +1,3 @@ -import axios from 'axios'; // For HTTP requests import { PDFLoader } from "../document_loaders/fs/pdf.js"; import { XMLParser } from 'fast-xml-parser'; // For parsing XML import { Document } from "@langchain/core/documents"; @@ -26,8 +25,13 @@ export async function fetchDirectArxivArticle(arxivIds: string): Promise id.trim()).filter(Boolean).join(','); const url = `http://export.arxiv.org/api/query?id_list=${idList}`; - const response = await axios.get(url); - const xml = response.data; + const response = await fetch(url); + + if (!response.ok) { + throw new Error(`HTTP error! status: ${response.status}`); + } + + const xml = await response.text(); const parser = new XMLParser({ ignoreAttributes: false, @@ -58,8 +62,13 @@ export async function fetchArxivResultsByQuery(query: string, start = 0, maxResu try { const encodedQuery = encodeURIComponent(query); const url = `http://export.arxiv.org/api/query?search_query=all:${encodedQuery}&start=${start}&max_results=${maxResults}`; - const response = await axios.get(url); - const xml = response.data; + const response = await fetch(url); + + if (!response.ok) { + throw new Error(`HTTP error! status: ${response.status}`); + } + + const xml = await response.text(); const parser = new XMLParser({ ignoreAttributes: false, @@ -97,11 +106,16 @@ export async function searchArxiv(query: string, maxResults = 3): Promise { try { - // Fetch the PDF as an array buffer - const response = await axios.get(pdfUrl, { responseType: "arraybuffer" }); - const buffer = Buffer.from(response.data); + // Fetch the PDF + const response = await fetch(pdfUrl); + + if (!response.ok) { + throw new Error(`HTTP error! 
status: ${response.status}`); + } + + const buffer = await response.arrayBuffer(); - // Convert the Buffer to a Blob + // Convert the ArrayBuffer to a Blob const blob = new Blob([buffer], { type: "application/pdf" }); // Use PDFLoader to process the PDF From 55eb7396a96de688857e837188e5f54f14c2ee92 Mon Sep 17 00:00:00 2001 From: Dhruvin Patel Date: Thu, 12 Dec 2024 20:34:53 -0500 Subject: [PATCH 08/16] Final changes to docs --- .../retrievers/arxiv-retriever.mdx | 91 ++++++++++++++++--- 1 file changed, 80 insertions(+), 11 deletions(-) diff --git a/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx b/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx index f82b5a76237e..1395fc10d459 100644 --- a/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx +++ b/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx @@ -3,7 +3,7 @@ ## Overview -The `arXiv Retriever` allows users to query the arXiv database for academic articles. It supports both full-document retrieval (PDF parsing) and summary-based retrieval. +The `arXiv Retriever` allows users to query the arXiv database for academic articles. It supports both full-document retrieval (PDF parsing) and summary-based retrieval. For detailed documentation of all ArxivRetriever features and configurations, head to [API reference](#https://arxiv.org/) --- @@ -14,7 +14,16 @@ The `arXiv Retriever` allows users to query the arXiv database for academic arti - Customizable Options: Configure maximum results and output format. 
--- -## Installation + +## Integration details + +| Retriever | Source | Package | +| ---------------- | ---------------------------- | --------------------------------------- | +| `ArxivRetriever` | Academic articles from arXiv | `@langchain-community/retrievers/arxiv` | + +--- + +## Setup/Installation Ensure the following dependencies are installed: - `pdf-parse` for parsing PDFs @@ -25,20 +34,71 @@ npm install pdf-parse fast-xml-parser ``` --- -## Getting started -#### Import the path -```typescript -import { ArxivRetriever } from "langchain-community/retrievers/arxiv.js"; -``` - -#### Instantiate the retriever +## Instantiate the retriever ```typescript const retriever = new ArxivRetriever({ getFullDocuments: false, // Set to true to fetch full documents (PDFs) maxSearchResults: 5, // Maximum number of results to retrieve }); ``` +--- +## Usage + +Use the `invoke` method to search arXiv for relevant articles. You can use either natural language queries or specific arXiv IDs. + +```typescript +const query = "quantum computing"; + +const documents = await retriever.invoke(query); +documents.forEach(doc => { + console.log("Title:", doc.metadata.title); + console.log("Content:", doc.pageContent); // Parsed PDF content +}); +``` + +--- + +## Use within a chain + +Like other retrievers, `ArxivRetriever` can be incorporated into LLM applications via chains. Below is an example of using the retriever within a chain: + +```typescript +import { ChatOpenAI } from "@langchain/openai"; +import { ChatPromptTemplate } from "@langchain/core/prompts"; +import { RunnablePassthrough, RunnableSequence } from "@langchain/core/runnables"; +import { StringOutputParser } from "@langchain/core/output_parsers"; +import type { Document } from "@langchain/core/documents"; + +const llm = new ChatOpenAI({ + model: "gpt-4o-mini", + temperature: 0, +}); + +const prompt = ChatPromptTemplate.fromTemplate(` +Answer the question based only on the context provided. 
+ +Context: {context} + +Question: {question}`); + +const formatDocs = (docs: Document[]) => { + return docs.map((doc) => doc.pageContent).join("\n\n"); +}; + +const ragChain = RunnableSequence.from([ + { + context: retriever.pipe(formatDocs), + question: new RunnablePassthrough(), + }, + prompt, + llm, + new StringOutputParser(), +]); + +await ragChain.invoke("What are the latest advances in quantum computing?"); +``` + --- ## Class: ArxivRetriever @@ -51,7 +111,6 @@ const retriever = new ArxivRetriever({ | `maxSearchResults` | `number` | `10` | Maximum number of results to fetch from arXiv. | - ### Methods ### `invoke(query: string): Promise` @@ -74,4 +133,14 @@ documents.forEach(doc => { console.log("Title:", doc.metadata.title); console.log("Content:", doc.pageContent); // Parsed PDF content }); -``` \ No newline at end of file +``` + +For detailed documentation of all ArxivRetriever features and configurations, head to [API reference](#https://arxiv.org/) + + + + + + + + From 3ae9fc9484ccc9726afdc6b1a6c3ee258f5fca26 Mon Sep 17 00:00:00 2001 From: Jacob Lee Date: Mon, 23 Dec 2024 17:42:26 -0800 Subject: [PATCH 09/16] Update arxiv-retriever.mdx --- .../retrievers/arxiv-retriever.mdx | 63 ++----------------- 1 file changed, 6 insertions(+), 57 deletions(-) diff --git a/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx b/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx index 1395fc10d459..fdf4804b388a 100644 --- a/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx +++ b/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx @@ -1,27 +1,21 @@ -# ArxivRetriever in LangChain.js (Docs) +# ArxivRetriever --- ## Overview The `arXiv Retriever` allows users to query the arXiv database for academic articles. It supports both full-document retrieval (PDF parsing) and summary-based retrieval. 
For detailed documentation of all ArxivRetriever features and configurations, head to [API reference](#https://arxiv.org/) ---- - ## Features - Query Flexibility: Search using natural language queries or specific arXiv IDs. - Full-Document Retrieval: Option to fetch and parse PDFs. - Summaries as Documents: Retrieve summaries for faster results. - Customizable Options: Configure maximum results and output format. ---- - ## Integration details | Retriever | Source | Package | | ---------------- | ---------------------------- | --------------------------------------- | -| `ArxivRetriever` | Academic articles from arXiv | `@langchain-community/retrievers/arxiv` | - ---- +| `ArxivRetriever` | Academic articles from arXiv | [`@langchain/community`](https://www.npmjs.com/package/@langchain/community) | ## Setup/Installation @@ -29,20 +23,21 @@ Ensure the following dependencies are installed: - `pdf-parse` for parsing PDFs - `fast-xml-parser` for parsing XML responses from the arXiv API -```bash +```npm2yarn npm install pdf-parse fast-xml-parser ``` --- ## Instantiate the retriever + ```typescript const retriever = new ArxivRetriever({ getFullDocuments: false, // Set to true to fetch full documents (PDFs) maxSearchResults: 5, // Maximum number of results to retrieve }); ``` ---- + ## Usage Use the `invoke` method to search arXiv for relevant articles. You can use either natural language queries or specific arXiv IDs. @@ -57,8 +52,6 @@ documents.forEach(doc => { }); ``` ---- - ## Use within a chain Like other retrievers, `ArxivRetriever` can be incorporated into LLM applications via chains. 
Below is an example of using the retriever within a chain: @@ -99,48 +92,4 @@ const ragChain = RunnableSequence.from([ await ragChain.invoke("What are the latest advances in quantum computing?"); ``` ---- - -## Class: ArxivRetriever - -### Parameters - -| Name | Type | Default | Description | -|-------------------|-----------|---------|------------------------------------------------------| -| `getFullDocuments` | `boolean` | `false` | Whether to fetch full documents (PDFs) instead of summaries. | -| `maxSearchResults` | `number` | `10` | Maximum number of results to fetch from arXiv. | - - -### Methods - -### `invoke(query: string): Promise` - -Use the invoke method to search arXiv for relevant articles. You can use either natural language queries or specific arXiv IDs. - -#### Parameters - -| Name | Type | Description | -|--------|----------|----------------------------------------| -| `query` | `string` | A natural language query or arXiv ID. | - -#### Returns -A `Promise` that resolves to an array of LangChain `Document` instances. 
- -#### Example -```typescript -const documents = await retriever.invoke("quantum computing"); -documents.forEach(doc => { - console.log("Title:", doc.metadata.title); - console.log("Content:", doc.pageContent); // Parsed PDF content -}); -``` - -For detailed documentation of all ArxivRetriever features and configurations, head to [API reference](#https://arxiv.org/) - - - - - - - - +For detailed documentation of all ArxivRetriever features and configurations, head to the [API reference](https://api.js.langchain.com/classes/_langchain_community.retrievers_arxiv.ArxivRetriever.html) From 58931bf0f49f2e59f5cd089b1cae967098654d26 Mon Sep 17 00:00:00 2001 From: jacoblee93 Date: Mon, 23 Dec 2024 17:53:43 -0800 Subject: [PATCH 10/16] Format, rename, fix docs --- .../retrievers/arxiv-retriever.mdx | 20 +- examples/src/retrievers/arxiv.ts | 18 +- libs/langchain-community/langchain.config.js | 1 + .../src/load/import_constants.ts | 1 + .../src/load/import_map.ts | 1 - .../src/retrievers/arxiv.ts | 62 +- .../src/retrievers/tests/arxiv.int.test.ts | 552 +++++++++--------- libs/langchain-community/src/utils/arxiv.ts | 394 +++++++------ 8 files changed, 527 insertions(+), 522 deletions(-) diff --git a/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx b/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx index fdf4804b388a..fff4da2a0a2d 100644 --- a/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx +++ b/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx @@ -1,4 +1,5 @@ # ArxivRetriever + --- ## Overview @@ -6,6 +7,7 @@ The `arXiv Retriever` allows users to query the arXiv database for academic articles. It supports both full-document retrieval (PDF parsing) and summary-based retrieval. For detailed documentation of all ArxivRetriever features and configurations, head to [API reference](#https://arxiv.org/) ## Features + - Query Flexibility: Search using natural language queries or specific arXiv IDs. 
- Full-Document Retrieval: Option to fetch and parse PDFs. - Summaries as Documents: Retrieve summaries for faster results. @@ -13,28 +15,27 @@ The `arXiv Retriever` allows users to query the arXiv database for academic arti ## Integration details -| Retriever | Source | Package | -| ---------------- | ---------------------------- | --------------------------------------- | +| Retriever | Source | Package | +| ---------------- | ---------------------------- | ---------------------------------------------------------------------------- | | `ArxivRetriever` | Academic articles from arXiv | [`@langchain/community`](https://www.npmjs.com/package/@langchain/community) | ## Setup/Installation Ensure the following dependencies are installed: + - `pdf-parse` for parsing PDFs - `fast-xml-parser` for parsing XML responses from the arXiv API ```npm2yarn npm install pdf-parse fast-xml-parser ``` ---- - ## Instantiate the retriever ```typescript const retriever = new ArxivRetriever({ - getFullDocuments: false, // Set to true to fetch full documents (PDFs) - maxSearchResults: 5, // Maximum number of results to retrieve + returnFullDocuments: false, // Set to true to fetch full documents (PDFs) + maxSearchResults: 5, // Maximum number of results to retrieve }); ``` @@ -46,7 +47,7 @@ Use the `invoke` method to search arXiv for relevant articles. 
You can use eithe const query = "quantum computing"; const documents = await retriever.invoke(query); -documents.forEach(doc => { +documents.forEach((doc) => { console.log("Title:", doc.metadata.title); console.log("Content:", doc.pageContent); // Parsed PDF content }); @@ -59,7 +60,10 @@ Like other retrievers, `ArxivRetriever` can be incorporated into LLM application ```typescript import { ChatOpenAI } from "@langchain/openai"; import { ChatPromptTemplate } from "@langchain/core/prompts"; -import { RunnablePassthrough, RunnableSequence } from "@langchain/core/runnables"; +import { + RunnablePassthrough, + RunnableSequence, +} from "@langchain/core/runnables"; import { StringOutputParser } from "@langchain/core/output_parsers"; import type { Document } from "@langchain/core/documents"; diff --git a/examples/src/retrievers/arxiv.ts b/examples/src/retrievers/arxiv.ts index 3c164844474e..fe4a4346371c 100644 --- a/examples/src/retrievers/arxiv.ts +++ b/examples/src/retrievers/arxiv.ts @@ -1,4 +1,4 @@ -import { ArxivRetriever } from "../../../libs/langchain-community/src/retrievers/arxiv.js"; +import { ArxivRetriever } from "@langchain/community/retrievers/arxiv"; export const run = async () => { /* @@ -7,8 +7,8 @@ export const run = async () => { const queryId = "1605.08386 2103.03404"; const retrieverById = new ArxivRetriever({ - getFullDocuments: true, - maxSearchResults: 5 + returnFullDocuments: true, + maxSearchResults: 5, }); const documentsById = await retrieverById.invoke(queryId); console.log(documentsById); @@ -41,12 +41,10 @@ export const run = async () => { */ const queryNat = "What is the ImageBind model?"; - const retrieverByNat = new ArxivRetriever( - { - getFullDocuments: false, - maxSearchResults: 2 - } - ); + const retrieverByNat = new ArxivRetriever({ + returnFullDocuments: false, + maxSearchResults: 2, + }); const documentsByQuery = await retrieverByNat.invoke(queryNat); console.log(documentsByQuery); @@ -64,4 +62,4 @@ export const run = async () => 
{ } ] */ -}; \ No newline at end of file +}; diff --git a/libs/langchain-community/langchain.config.js b/libs/langchain-community/langchain.config.js index 09d2baf3cab6..f0c1914d5e78 100644 --- a/libs/langchain-community/langchain.config.js +++ b/libs/langchain-community/langchain.config.js @@ -438,6 +438,7 @@ export const config = { "chat_models/zhipuai", "retrievers/amazon_kendra", "retrievers/amazon_knowledge_base", + "retrievers/arxiv", "retrievers/dria", "retrievers/metal", "retrievers/supabase", diff --git a/libs/langchain-community/src/load/import_constants.ts b/libs/langchain-community/src/load/import_constants.ts index 5930f82690db..014d418e872d 100644 --- a/libs/langchain-community/src/load/import_constants.ts +++ b/libs/langchain-community/src/load/import_constants.ts @@ -100,6 +100,7 @@ export const optionalImportEntrypoints: string[] = [ "langchain_community/callbacks/handlers/upstash_ratelimit", "langchain_community/retrievers/amazon_kendra", "langchain_community/retrievers/amazon_knowledge_base", + "langchain_community/retrievers/arxiv", "langchain_community/retrievers/dria", "langchain_community/retrievers/metal", "langchain_community/retrievers/supabase", diff --git a/libs/langchain-community/src/load/import_map.ts b/libs/langchain-community/src/load/import_map.ts index 3a96bdf2cd8c..2ec7b20bc542 100644 --- a/libs/langchain-community/src/load/import_map.ts +++ b/libs/langchain-community/src/load/import_map.ts @@ -57,7 +57,6 @@ export * as chat_models__novita from "../chat_models/novita.js"; export * as chat_models__ollama from "../chat_models/ollama.js"; export * as chat_models__togetherai from "../chat_models/togetherai.js"; export * as chat_models__yandex from "../chat_models/yandex.js"; -export * as retrievers__arxiv from "../retrievers/arxiv.js"; export * as retrievers__bm25 from "../retrievers/bm25.js"; export * as retrievers__chaindesk from "../retrievers/chaindesk.js"; export * as retrievers__databerry from "../retrievers/databerry.js"; diff 
--git a/libs/langchain-community/src/retrievers/arxiv.ts b/libs/langchain-community/src/retrievers/arxiv.ts index b8bb5a524eea..2eb0ab6c5675 100644 --- a/libs/langchain-community/src/retrievers/arxiv.ts +++ b/libs/langchain-community/src/retrievers/arxiv.ts @@ -1,10 +1,14 @@ import { BaseRetriever, BaseRetrieverInput } from "@langchain/core/retrievers"; import { Document } from "@langchain/core/documents"; -import { searchArxiv, loadDocsFromResults, getDocsFromSummaries } from '../utils/arxiv.js'; +import { + searchArxiv, + loadDocsFromResults, + getDocsFromSummaries, +} from "../utils/arxiv.js"; export type ArxivRetrieverOptions = { - getFullDocuments?: boolean; - maxSearchResults?: number; + returnFullDocuments?: boolean; + maxSearchResults?: number; } & BaseRetrieverInput; /** @@ -12,34 +16,36 @@ export type ArxivRetrieverOptions = { * It can retrieve either full documents (PDFs) or just summaries. */ export class ArxivRetriever extends BaseRetriever { - static lc_name() { - return "ArxivRetriever"; - } + static lc_name() { + return "ArxivRetriever"; + } - lc_namespace = ["langchain", "retrievers", "arxiv_retriever"]; + lc_namespace = ["langchain", "retrievers", "arxiv_retriever"]; - getFullDocuments: boolean; - maxSearchResults: number; + returnFullDocuments = false; - constructor(options: ArxivRetrieverOptions = {}) { - super(options); - this.getFullDocuments = options.getFullDocuments ?? false; - this.maxSearchResults = options.maxSearchResults ?? 10; - } + maxSearchResults = 10; + + constructor(options: ArxivRetrieverOptions = {}) { + super(options); + this.returnFullDocuments = + options.returnFullDocuments ?? this.returnFullDocuments; + this.maxSearchResults = options.maxSearchResults ?? 
this.maxSearchResults; + } + + async _getRelevantDocuments(query: string): Promise { + try { + const results = await searchArxiv(query, this.maxSearchResults); - async _getRelevantDocuments(query: string): Promise { - try { - const results = await searchArxiv(query, this.maxSearchResults); - - if (this.getFullDocuments) { - // Fetch and parse PDFs to get full documents - return await loadDocsFromResults(results); - } else { - // Use summaries as documents - return getDocsFromSummaries(results); - } - } catch (error) { - throw new Error(`Error retrieving documents from arXiv.`); - } + if (this.returnFullDocuments) { + // Fetch and parse PDFs to get full documents + return await loadDocsFromResults(results); + } else { + // Use summaries as documents + return getDocsFromSummaries(results); + } + } catch (error) { + throw new Error(`Error retrieving documents from arXiv.`); } + } } diff --git a/libs/langchain-community/src/retrievers/tests/arxiv.int.test.ts b/libs/langchain-community/src/retrievers/tests/arxiv.int.test.ts index 11eb1040ed18..2d8467fe1f57 100644 --- a/libs/langchain-community/src/retrievers/tests/arxiv.int.test.ts +++ b/libs/langchain-community/src/retrievers/tests/arxiv.int.test.ts @@ -2,337 +2,317 @@ import { test, expect } from "@jest/globals"; import { ArxivRetriever } from "../arxiv.js"; test("ArxivRetriever fetching document summaries test", async () => { - // Sample integration test for ArxivRetriever using the "machine learning" query - const retriever = new ArxivRetriever( - { - getFullDocuments: false, - maxSearchResults: 5 - } + // Sample integration test for ArxivRetriever using the "machine learning" query + const retriever = new ArxivRetriever({ + returnFullDocuments: false, + maxSearchResults: 5, + }); + const query = "machine learning"; + const results = await retriever._getRelevantDocuments(query); + + expect(results).toBeDefined(); + expect(results.length).toBeGreaterThan(0); + expect(results.length).toBeLessThanOrEqual(5); + + for 
(let i = 0; i < results.length; i += 1) { + expect(results[i]).toHaveProperty("pageContent"); + expect(results[i].pageContent).toBeDefined(); + + expect(results[i]).toHaveProperty("metadata"); + expect(results[i].metadata).toBeInstanceOf(Object); + expect(results[i].metadata).toHaveProperty("authors"); + expect(results[i].metadata.authors).toBeInstanceOf(Array); + expect(results[i].metadata).toHaveProperty("id"); + expect(results[i].metadata.id).toContain("arxiv.org"); + expect(results[i].metadata).toHaveProperty("published"); + expect(results[i].metadata.published).toMatch( + /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/ ); - const query = "machine learning"; - const results = await retriever._getRelevantDocuments(query); - - expect(results).toBeDefined(); - expect(results.length).toBeGreaterThan(0); - expect(results.length).toBeLessThanOrEqual(5); - - for (let i = 0; i < results.length; i += 1) { - expect(results[i]).toHaveProperty("pageContent"); - expect(results[i].pageContent).toBeDefined(); - - expect(results[i]).toHaveProperty("metadata"); - expect(results[i].metadata).toBeInstanceOf(Object); - expect(results[i].metadata).toHaveProperty("authors"); - expect(results[i].metadata.authors).toBeInstanceOf(Array); - expect(results[i].metadata).toHaveProperty("id"); - expect(results[i].metadata.id).toContain("arxiv.org"); - expect(results[i].metadata).toHaveProperty("published"); - expect(results[i].metadata.published).toMatch(/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/); - expect(results[i].metadata).toHaveProperty("source"); - expect(results[i].metadata.source).toBe("arxiv"); - expect(results[i].metadata).toHaveProperty("title"); - expect(results[i].metadata).toHaveProperty("updated"); - expect(results[i].metadata.updated).toMatch(/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/); - expect(results[i].metadata).toHaveProperty("url"); - expect(results[i].metadata.url).toContain("arxiv.org"); - } + expect(results[i].metadata).toHaveProperty("source"); + 
expect(results[i].metadata.source).toBe("arxiv"); + expect(results[i].metadata).toHaveProperty("title"); + expect(results[i].metadata).toHaveProperty("updated"); + expect(results[i].metadata.updated).toMatch( + /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/ + ); + expect(results[i].metadata).toHaveProperty("url"); + expect(results[i].metadata.url).toContain("arxiv.org"); + } }); test("ArxivRetriever fetching document summaries with invalid query test", async () => { - // Sample test for ArxivRetriever using an invalid query - const retriever = new ArxivRetriever( - { - getFullDocuments: false, - maxSearchResults: 5 - } - ); - const query = "fjalsdkjfw"; - const results = await retriever._getRelevantDocuments(query); - - expect(results).toBeDefined(); - expect(results.length).toBe(0); + // Sample test for ArxivRetriever using an invalid query + const retriever = new ArxivRetriever({ + returnFullDocuments: false, + maxSearchResults: 5, + }); + const query = "fjalsdkjfw"; + const results = await retriever._getRelevantDocuments(query); + + expect(results).toBeDefined(); + expect(results.length).toBe(0); }); test("ArxivRetriever fetching document summaries with empty query test", async () => { - // Sample test for ArxivRetriever using an empty query - const retriever = new ArxivRetriever( - { - getFullDocuments: false, - maxSearchResults: 5 - } - ); - const query = ""; - const results = await retriever._getRelevantDocuments(query); - - expect(results).toBeDefined(); - expect(results.length).toBe(0); + // Sample test for ArxivRetriever using an empty query + const retriever = new ArxivRetriever({ + returnFullDocuments: false, + maxSearchResults: 5, + }); + const query = ""; + const results = await retriever._getRelevantDocuments(query); + + expect(results).toBeDefined(); + expect(results.length).toBe(0); }); test("ArxivRetriever fetching document summaries with invalid maxSearchResults test", async () => { - // Sample test for ArxivRetriever using an invalid maxSearchResults - 
try { - const retriever = new ArxivRetriever( - { - getFullDocuments: true, - maxSearchResults: -1 - } - ); - const query = "machine learning"; - const results = await retriever._getRelevantDocuments(query); - expect(results).toBeUndefined(); - expect(results.length).toBe(0); - } catch (error) { - expect(error).toBeDefined(); - expect(error).toBeInstanceOf(Error); - } + // Sample test for ArxivRetriever using an invalid maxSearchResults + try { + const retriever = new ArxivRetriever({ + returnFullDocuments: true, + maxSearchResults: -1, + }); + const query = "machine learning"; + const results = await retriever._getRelevantDocuments(query); + expect(results).toBeUndefined(); + expect(results.length).toBe(0); + } catch (error) { + expect(error).toBeDefined(); + expect(error).toBeInstanceOf(Error); + } }); test("ArxivRetriever fetching document summaries with zero maxSearchResults test", async () => { - // Sample test for ArxivRetriever using an zero maxSearchResults - try { - const retriever = new ArxivRetriever( - { - getFullDocuments: true, - maxSearchResults: 0 - } - ); - const query = "machine learning"; - const results = await retriever._getRelevantDocuments(query); - expect(results).toBeUndefined(); - expect(results.length).toBe(0); - } catch (error) { - expect(error).toBeDefined(); - expect(error).toBeInstanceOf(Error); - } + // Sample test for ArxivRetriever using an zero maxSearchResults + try { + const retriever = new ArxivRetriever({ + returnFullDocuments: true, + maxSearchResults: 0, + }); + const query = "machine learning"; + const results = await retriever._getRelevantDocuments(query); + expect(results).toBeUndefined(); + expect(results.length).toBe(0); + } catch (error) { + expect(error).toBeDefined(); + expect(error).toBeInstanceOf(Error); + } }); test("ArxivRetriever fetching full documents test", async () => { - // Sample test for fetching full documents with ArxivRetriever - const retriever = new ArxivRetriever( - { - getFullDocuments: true, - 
maxSearchResults: 5 - } + // Sample test for fetching full documents with ArxivRetriever + const retriever = new ArxivRetriever({ + returnFullDocuments: true, + maxSearchResults: 5, + }); + const query = "machine learning"; + const results = await retriever._getRelevantDocuments(query); + + expect(results).toBeDefined(); + expect(results.length).toBeGreaterThan(0); + expect(results.length).toBeLessThanOrEqual(5); + + for (let i = 0; i < results.length; i += 1) { + expect(results[i]).toHaveProperty("pageContent"); + expect(results[i].pageContent).toBeDefined(); + + expect(results[i]).toHaveProperty("id"); + + expect(results[i]).toHaveProperty("metadata"); + expect(results[i].metadata).toBeInstanceOf(Object); + expect(results[i].metadata).toHaveProperty("authors"); + expect(results[i].metadata.authors).toBeInstanceOf(Array); + expect(results[i].metadata).toHaveProperty("id"); + expect(results[i].metadata.id).toContain("arxiv.org"); + expect(results[i].metadata).toHaveProperty("published"); + expect(results[i].metadata.published).toMatch( + /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/ ); - const query = "machine learning"; - const results = await retriever._getRelevantDocuments(query); - - expect(results).toBeDefined(); - expect(results.length).toBeGreaterThan(0); - expect(results.length).toBeLessThanOrEqual(5); - - for (let i = 0; i < results.length; i += 1) { - expect(results[i]).toHaveProperty("pageContent"); - expect(results[i].pageContent).toBeDefined(); - - expect(results[i]).toHaveProperty("id"); - - expect(results[i]).toHaveProperty("metadata"); - expect(results[i].metadata).toBeInstanceOf(Object); - expect(results[i].metadata).toHaveProperty("authors"); - expect(results[i].metadata.authors).toBeInstanceOf(Array); - expect(results[i].metadata).toHaveProperty("id"); - expect(results[i].metadata.id).toContain("arxiv.org"); - expect(results[i].metadata).toHaveProperty("published"); - 
expect(results[i].metadata.published).toMatch(/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/); - expect(results[i].metadata).toHaveProperty("source"); - expect(results[i].metadata.source).toBe("arxiv"); - expect(results[i].metadata).toHaveProperty("title"); - expect(results[i].metadata).toHaveProperty("updated"); - expect(results[i].metadata.updated).toMatch(/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/); - expect(results[i].metadata).toHaveProperty("url"); - expect(results[i].metadata.url).toContain("arxiv.org"); - expect(results[i].metadata).toHaveProperty("summary"); - } + expect(results[i].metadata).toHaveProperty("source"); + expect(results[i].metadata.source).toBe("arxiv"); + expect(results[i].metadata).toHaveProperty("title"); + expect(results[i].metadata).toHaveProperty("updated"); + expect(results[i].metadata.updated).toMatch( + /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/ + ); + expect(results[i].metadata).toHaveProperty("url"); + expect(results[i].metadata.url).toContain("arxiv.org"); + expect(results[i].metadata).toHaveProperty("summary"); + } }); test("ArxivRetriever fetching full documents with invalid query test", async () => { - // Sample test for fetching full documents with ArxivRetriever using an invalid query - const retriever = new ArxivRetriever( - { - getFullDocuments: true, - maxSearchResults: 5 - } - ); - const query = "fjalsdkjfw"; - const results = await retriever._getRelevantDocuments(query); - - expect(results).toBeDefined(); - expect(results.length).toBe(0); + // Sample test for fetching full documents with ArxivRetriever using an invalid query + const retriever = new ArxivRetriever({ + returnFullDocuments: true, + maxSearchResults: 5, + }); + const query = "fjalsdkjfw"; + const results = await retriever._getRelevantDocuments(query); + + expect(results).toBeDefined(); + expect(results.length).toBe(0); }); test("ArxivRetriever fetching full documents with empty query test", async () => { - // Sample test for fetching full documents with ArxivRetriever 
using an empty query - const retriever = new ArxivRetriever( - { - getFullDocuments: true, - maxSearchResults: 5 - } - ); - const query = ""; - const results = await retriever._getRelevantDocuments(query); - - expect(results).toBeDefined(); - expect(results.length).toBe(0); + // Sample test for fetching full documents with ArxivRetriever using an empty query + const retriever = new ArxivRetriever({ + returnFullDocuments: true, + maxSearchResults: 5, + }); + const query = ""; + const results = await retriever._getRelevantDocuments(query); + + expect(results).toBeDefined(); + expect(results.length).toBe(0); }); test("ArxivRetriever fetching full documents with invalid maxSearchResults test", async () => { - // Sample test for fetching full documents with ArxivRetriever using an invalid maxSearchResults - try { - const retriever = new ArxivRetriever( - { - getFullDocuments: true, - maxSearchResults: -1 - } - ); - const query = "machine learning"; - const results = await retriever._getRelevantDocuments(query); - expect(results).toBeUndefined(); - expect(results.length).toBe(0); - } catch (error) { - expect(error).toBeDefined(); - expect(error).toBeInstanceOf(Error); - } + // Sample test for fetching full documents with ArxivRetriever using an invalid maxSearchResults + try { + const retriever = new ArxivRetriever({ + returnFullDocuments: true, + maxSearchResults: -1, + }); + const query = "machine learning"; + const results = await retriever._getRelevantDocuments(query); + expect(results).toBeUndefined(); + expect(results.length).toBe(0); + } catch (error) { + expect(error).toBeDefined(); + expect(error).toBeInstanceOf(Error); + } }); test("ArxivRetriever fetching full documents with zero maxSearchResults", async () => { - // Sample test for fetching full documents with ArxivRetriever using an zero maxSearchResults - try { - const retriever = new ArxivRetriever( - { - getFullDocuments: true, - maxSearchResults: 0 - } - ); - const query = "machine learning"; - const 
results = await retriever._getRelevantDocuments(query); - expect(results).toBeUndefined(); - expect(results.length).toBe(0); - } catch (error) { - expect(error).toBeDefined(); - expect(error).toBeInstanceOf(Error); - } + // Sample test for fetching full documents with ArxivRetriever using an zero maxSearchResults + try { + const retriever = new ArxivRetriever({ + returnFullDocuments: true, + maxSearchResults: 0, + }); + const query = "machine learning"; + const results = await retriever._getRelevantDocuments(query); + expect(results).toBeUndefined(); + expect(results.length).toBe(0); + } catch (error) { + expect(error).toBeDefined(); + expect(error).toBeInstanceOf(Error); + } }); test("ArxivRetriever search articles by id test", async () => { - // Sample test for fetching articles by arXiv IDs - const fetchIds = "2103.03404 2103.03405"; - const retriever = new ArxivRetriever( - { - getFullDocuments: false, - maxSearchResults: 5 - } + // Sample test for fetching articles by arXiv IDs + const fetchIds = "2103.03404 2103.03405"; + const retriever = new ArxivRetriever({ + returnFullDocuments: false, + maxSearchResults: 5, + }); + const results = await retriever.invoke(fetchIds); + + expect(results).toBeDefined(); + expect(results.length).toBe(2); + + for (let i = 0; i < results.length; i += 1) { + expect(results[i]).toHaveProperty("pageContent"); + expect(results[i].pageContent).toBeDefined(); + + expect(results[i]).toHaveProperty("metadata"); + expect(results[i].metadata).toBeInstanceOf(Object); + expect(results[i].metadata).toHaveProperty("authors"); + expect(results[i].metadata.authors).toBeInstanceOf(Array); + expect(results[i].metadata).toHaveProperty("id"); + expect(results[i].metadata.id).toContain("arxiv.org"); + expect(results[i].metadata).toHaveProperty("published"); + expect(results[i].metadata.published).toMatch( + /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/ ); - const results = await retriever.invoke(fetchIds); - - expect(results).toBeDefined(); - 
expect(results.length).toBe(2); - - for (let i = 0; i < results.length; i += 1) { - expect(results[i]).toHaveProperty("pageContent"); - expect(results[i].pageContent).toBeDefined(); - - expect(results[i]).toHaveProperty("metadata"); - expect(results[i].metadata).toBeInstanceOf(Object); - expect(results[i].metadata).toHaveProperty("authors"); - expect(results[i].metadata.authors).toBeInstanceOf(Array); - expect(results[i].metadata).toHaveProperty("id"); - expect(results[i].metadata.id).toContain("arxiv.org"); - expect(results[i].metadata).toHaveProperty("published"); - expect(results[i].metadata.published).toMatch(/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/); - expect(results[i].metadata).toHaveProperty("source"); - expect(results[i].metadata.source).toBe("arxiv"); - expect(results[i].metadata).toHaveProperty("title"); - expect(results[i].metadata).toHaveProperty("updated"); - expect(results[i].metadata.updated).toMatch(/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/); - expect(results[i].metadata).toHaveProperty("url"); - expect(results[i].metadata.url).toContain("arxiv.org"); - } + expect(results[i].metadata).toHaveProperty("source"); + expect(results[i].metadata.source).toBe("arxiv"); + expect(results[i].metadata).toHaveProperty("title"); + expect(results[i].metadata).toHaveProperty("updated"); + expect(results[i].metadata.updated).toMatch( + /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/ + ); + expect(results[i].metadata).toHaveProperty("url"); + expect(results[i].metadata.url).toContain("arxiv.org"); + } }); test("ArxivRetriever search articles by id with invalid id test", async () => { - // Sample test for fetching articles by arXiv IDs with an invalid ID - const fetchIds = "2103.03404 2103.03405 1234.56789"; - const retriever = new ArxivRetriever( - { - getFullDocuments: false, - maxSearchResults: 5 - } - ); - const results = await retriever.invoke(fetchIds); - - expect(results).toBeDefined(); - expect(results.length).toBeLessThan(3); + // Sample test for fetching articles 
by arXiv IDs with an invalid ID + const fetchIds = "2103.03404 2103.03405 1234.56789"; + const retriever = new ArxivRetriever({ + returnFullDocuments: false, + maxSearchResults: 5, + }); + const results = await retriever.invoke(fetchIds); + + expect(results).toBeDefined(); + expect(results.length).toBeLessThan(3); }); test("ArxivRetriever search articles by id with empty id test", async () => { - // Sample test for fetching articles by arXiv IDs with an empty ID - const fetchIds = ""; - const retriever = new ArxivRetriever( - { - getFullDocuments: false, - maxSearchResults: 5 - } - ); - const results = await retriever.invoke(fetchIds); - - expect(results).toBeDefined(); - expect(results.length).toBe(0); + // Sample test for fetching articles by arXiv IDs with an empty ID + const fetchIds = ""; + const retriever = new ArxivRetriever({ + returnFullDocuments: false, + maxSearchResults: 5, + }); + const results = await retriever.invoke(fetchIds); + + expect(results).toBeDefined(); + expect(results.length).toBe(0); }); test("ArxivRetriever search articles by id with invalid maxSearchResults test", async () => { - // Sample test for fetching articles by arXiv IDs with an invalid maxSearchResults - try { - const fetchIds = "2103.03404 2103.03405"; - const retriever = new ArxivRetriever( - { - getFullDocuments: false, - maxSearchResults: -1 - } - ); - const results = await retriever.invoke(fetchIds); - expect(results).toBeUndefined(); - expect(results.length).toBe(0); - } catch (error) { - expect(error).toBeDefined(); - expect(error).toBeInstanceOf(Error); - } + // Sample test for fetching articles by arXiv IDs with an invalid maxSearchResults + try { + const fetchIds = "2103.03404 2103.03405"; + const retriever = new ArxivRetriever({ + returnFullDocuments: false, + maxSearchResults: -1, + }); + const results = await retriever.invoke(fetchIds); + expect(results).toBeUndefined(); + expect(results.length).toBe(0); + } catch (error) { + expect(error).toBeDefined(); + 
expect(error).toBeInstanceOf(Error); + } }); test("ArxivRetriever search articles by id with invalid id and maxSearchResults test", async () => { - // Sample test for fetching articles by arXiv IDs with an invalid ID and maxSearchResults - try { - const fetchIds = "2103.03404 2103.03405 1234.56789"; - const retriever = new ArxivRetriever( - { - getFullDocuments: false, - maxSearchResults: -1 - } - ); - const results = await retriever.invoke(fetchIds); - expect(results).toBeUndefined(); - expect(results.length).toBe(0); - } catch (error) { - expect(error).toBeDefined(); - expect(error).toBeInstanceOf(Error); - } + // Sample test for fetching articles by arXiv IDs with an invalid ID and maxSearchResults + try { + const fetchIds = "2103.03404 2103.03405 1234.56789"; + const retriever = new ArxivRetriever({ + returnFullDocuments: false, + maxSearchResults: -1, + }); + const results = await retriever.invoke(fetchIds); + expect(results).toBeUndefined(); + expect(results.length).toBe(0); + } catch (error) { + expect(error).toBeDefined(); + expect(error).toBeInstanceOf(Error); + } }); test("ArxivRetriever search articles by id with invalid id and zero maxSearchResults test", async () => { - // Sample test for fetching articles by arXiv IDs with an invalid ID and zero maxSearchResults - try { - const fetchIds = "2103.03404 2103.03405 1234.56789"; - const retriever = new ArxivRetriever( - { - getFullDocuments: false, - maxSearchResults: 0 - } - ); - const results = await retriever.invoke(fetchIds); - expect(results).toBeUndefined(); - expect(results.length).toBe(0); - } catch (error) { - expect(error).toBeDefined(); - expect(error).toBeInstanceOf(Error); - } -}); \ No newline at end of file + // Sample test for fetching articles by arXiv IDs with an invalid ID and zero maxSearchResults + try { + const fetchIds = "2103.03404 2103.03405 1234.56789"; + const retriever = new ArxivRetriever({ + returnFullDocuments: false, + maxSearchResults: 0, + }); + const results = await 
retriever.invoke(fetchIds); + expect(results).toBeUndefined(); + expect(results.length).toBe(0); + } catch (error) { + expect(error).toBeDefined(); + expect(error).toBeInstanceOf(Error); + } +}); diff --git a/libs/langchain-community/src/utils/arxiv.ts b/libs/langchain-community/src/utils/arxiv.ts index c4ee12146bcf..9052fad0eb99 100644 --- a/libs/langchain-community/src/utils/arxiv.ts +++ b/libs/langchain-community/src/utils/arxiv.ts @@ -1,225 +1,241 @@ -import { PDFLoader } from "../document_loaders/fs/pdf.js"; -import { XMLParser } from 'fast-xml-parser'; // For parsing XML import { Document } from "@langchain/core/documents"; +import { XMLParser } from "fast-xml-parser"; + +import { PDFLoader } from "../document_loaders/fs/pdf.js"; // Interface for processed arXiv entry interface ArxivEntry { - id: string; - title: string; - summary: string; - published: string; - updated: string; - authors: string[]; - pdfUrl: string; - links: any[]; + id: string; + title: string; + summary: string; + published: string; + updated: string; + authors: string[]; + pdfUrl: string; + links: any[]; } // Used to check if the query is an arXiv ID, or a natural language query export function isArXivIdentifier(query: string): boolean { - const arxivIdRegex = /^\d{4}\.\d{4,5}(v\d+)?$|^\d{7}(\.\d+)?(v\d+)?$/; - return arxivIdRegex.test(query.trim()); + const arxivIdRegex = /^\d{4}\.\d{4,5}(v\d+)?$|^\d{7}(\.\d+)?(v\d+)?$/; + return arxivIdRegex.test(query.trim()); } // Used to fetch direct arXiv articles by IDs (supports multiple IDs) -export async function fetchDirectArxivArticle(arxivIds: string): Promise { - try { - const idList = arxivIds.split(/[\s,]+/).map(id => id.trim()).filter(Boolean).join(','); - const url = `http://export.arxiv.org/api/query?id_list=${idList}`; - const response = await fetch(url); - - if (!response.ok) { - throw new Error(`HTTP error! 
status: ${response.status}`); - } - - const xml = await response.text(); - - const parser = new XMLParser({ - ignoreAttributes: false, - attributeNamePrefix: "@_", - }); - const result = parser.parse(xml); - let entries = result.feed.entry; - - if (!entries) { - return []; - } - - // Ensure entries is an array - if (!Array.isArray(entries)) { - entries = [entries]; - } - - const processedEntries = entries.map(processEntry); - - return processedEntries; - } catch (error) { - throw new Error(`Failed to fetch articles with IDs ${arxivIds}`); +export async function fetchDirectArxivArticle( + arxivIds: string +): Promise { + try { + const idList = arxivIds + .split(/[\s,]+/) + .map((id) => id.trim()) + .filter(Boolean) + .join(","); + const url = `http://export.arxiv.org/api/query?id_list=${idList}`; + const response = await fetch(url); + + if (!response.ok) { + throw new Error(`HTTP error! status: ${response.status}`); + } + + const xml = await response.text(); + + const parser = new XMLParser({ + ignoreAttributes: false, + attributeNamePrefix: "@_", + }); + const result = parser.parse(xml); + let entries = result.feed.entry; + + if (!entries) { + return []; + } + + // Ensure entries is an array + if (!Array.isArray(entries)) { + entries = [entries]; } + + const processedEntries = entries.map(processEntry); + + return processedEntries; + } catch (error) { + throw new Error(`Failed to fetch articles with IDs ${arxivIds}`); + } } // Used to fetch arXiv results by natural language query with maxResults parameter -export async function fetchArxivResultsByQuery(query: string, start = 0, maxResults = 10): Promise { - try { - const encodedQuery = encodeURIComponent(query); - const url = `http://export.arxiv.org/api/query?search_query=all:${encodedQuery}&start=${start}&max_results=${maxResults}`; - const response = await fetch(url); - - if (!response.ok) { - throw new Error(`HTTP error! 
status: ${response.status}`); - } - - const xml = await response.text(); - - const parser = new XMLParser({ - ignoreAttributes: false, - attributeNamePrefix: "@_", - }); - const result = parser.parse(xml); - let entries = result.feed.entry; - - if (!entries) { - return []; - } - - // Ensure entries is an array - if (!Array.isArray(entries)) { - entries = [entries]; - } - - const processedEntries = entries.map(processEntry); - - return processedEntries; - } catch (error) { - throw new Error(`Failed to fetch articles with query "${query}"`); +export async function fetchArxivResultsByQuery( + query: string, + start = 0, + maxResults = 10 +): Promise { + try { + const encodedQuery = encodeURIComponent(query); + const url = `http://export.arxiv.org/api/query?search_query=all:${encodedQuery}&start=${start}&max_results=${maxResults}`; + const response = await fetch(url); + + if (!response.ok) { + throw new Error(`HTTP error! status: ${response.status}`); } + + const xml = await response.text(); + + const parser = new XMLParser({ + ignoreAttributes: false, + attributeNamePrefix: "@_", + }); + const result = parser.parse(xml); + let entries = result.feed.entry; + + if (!entries) { + return []; + } + + // Ensure entries is an array + if (!Array.isArray(entries)) { + entries = [entries]; + } + + const processedEntries = entries.map(processEntry); + + return processedEntries; + } catch (error) { + throw new Error(`Failed to fetch articles with query "${query}"`); + } } // Used to search for arXiv articles with a maxResults parameter -export async function searchArxiv(query: string, maxResults = 3): Promise { - if (isArXivIdentifier(query)) { - return await fetchDirectArxivArticle(query); - } else { - return await fetchArxivResultsByQuery(query, 0, maxResults); - } +export async function searchArxiv( + query: string, + maxResults = 3 +): Promise { + if (isArXivIdentifier(query)) { + return await fetchDirectArxivArticle(query); + } else { + return await 
fetchArxivResultsByQuery(query, 0, maxResults); + } } // Used to fetch and parse PDF to text export async function fetchAndParsePDF(pdfUrl: string): Promise { - try { - // Fetch the PDF - const response = await fetch(pdfUrl); - - if (!response.ok) { - throw new Error(`HTTP error! status: ${response.status}`); - } - - const buffer = await response.arrayBuffer(); - - // Convert the ArrayBuffer to a Blob - const blob = new Blob([buffer], { type: "application/pdf" }); - - // Use PDFLoader to process the PDF - const loader = new PDFLoader(blob, { splitPages: false }); // Pass the Blob - const docs: Document[] = await loader.load(); - - // Combine all document content into a single string - const content = docs.map((doc) => doc.pageContent).join("\n\n"); - return content; - } catch (error) { - throw new Error(`Failed to fetch or parse PDF from ${pdfUrl}`); + try { + // Fetch the PDF + const response = await fetch(pdfUrl); + + if (!response.ok) { + throw new Error(`HTTP error! status: ${response.status}`); } + + const buffer = await response.arrayBuffer(); + + // Convert the ArrayBuffer to a Blob + const blob = new Blob([buffer], { type: "application/pdf" }); + + // Use PDFLoader to process the PDF + const loader = new PDFLoader(blob, { splitPages: false }); // Pass the Blob + const docs: Document[] = await loader.load(); + + // Combine all document content into a single string + const content = docs.map((doc) => doc.pageContent).join("\n\n"); + return content; + } catch (error) { + throw new Error(`Failed to fetch or parse PDF from ${pdfUrl}`); + } } // Used to load raw text from each search result, and convert to Document instances -export async function loadDocsFromResults(results: ArxivEntry[]): Promise { - const docs: Document[] = []; - for (const result of results) { - const pdfUrl = result.pdfUrl; - try { - const pdfContent = await fetchAndParsePDF(pdfUrl); - const metadata = { - id: result.id, - title: result.title, - authors: result.authors, - published: 
result.published, - updated: result.updated, - source: 'arxiv', - url: result.id, - summary: result.summary, - }; - const doc = new Document({ - pageContent: pdfContent, - metadata, - }); - docs.push(doc); - } catch (error) { - throw new Error(`Error loading document from ${pdfUrl}`); - } +export async function loadDocsFromResults( + results: ArxivEntry[] +): Promise { + const docs: Document[] = []; + for (const result of results) { + const pdfUrl = result.pdfUrl; + try { + const pdfContent = await fetchAndParsePDF(pdfUrl); + const metadata = { + id: result.id, + title: result.title, + authors: result.authors, + published: result.published, + updated: result.updated, + source: "arxiv", + url: result.id, + summary: result.summary, + }; + const doc = new Document({ + pageContent: pdfContent, + metadata, + }); + docs.push(doc); + } catch (error) { + throw new Error(`Error loading document from ${pdfUrl}`); } - return docs; + } + return docs; } // Used to convert metadata and summaries to Document instances export function getDocsFromSummaries(results: ArxivEntry[]): Document[] { - const docs: Document[] = []; - for (const result of results) { - const metadata = { - id: result.id, - title: result.title, - authors: result.authors, - published: result.published, - updated: result.updated, - source: 'arxiv', - url: result.id, - }; - const doc = new Document({ - pageContent: result.summary, - metadata, - }); - docs.push(doc); - } - return docs; + const docs: Document[] = []; + for (const result of results) { + const metadata = { + id: result.id, + title: result.title, + authors: result.authors, + published: result.published, + updated: result.updated, + source: "arxiv", + url: result.id, + }; + const doc = new Document({ + pageContent: result.summary, + metadata, + }); + docs.push(doc); + } + return docs; } // Helper function to process each arXiv entry function processEntry(entry: any): ArxivEntry { - const id = entry.id; - const title = entry.title.replace(/\s+/g, ' 
').trim(); - const summary = entry.summary.replace(/\s+/g, ' ').trim(); - const published = entry.published; - const updated = entry.updated; - - // Extract authors - let authors: string[] = []; - if (Array.isArray(entry.author)) { - authors = entry.author.map((author: any) => author.name); - } else if (entry.author) { - authors = [entry.author.name]; - } - - // Extract links - let links: any[] = []; - if (Array.isArray(entry.link)) { - links = entry.link; - } else if (entry.link) { - links = [entry.link]; - } - - // Extract PDF link - let pdfUrl = id.replace('/abs/', '/pdf/') + '.pdf'; - const pdfLinkObj = links.find((link: any) => link["@_title"] === 'pdf'); - if (pdfLinkObj && pdfLinkObj["@_href"]) { - pdfUrl = pdfLinkObj["@_href"]; - } - - return { - id, - title, - summary, - published, - updated, - authors, - pdfUrl, - links, - }; -} \ No newline at end of file + const id = entry.id; + const title = entry.title.replace(/\s+/g, " ").trim(); + const summary = entry.summary.replace(/\s+/g, " ").trim(); + const published = entry.published; + const updated = entry.updated; + + // Extract authors + let authors: string[] = []; + if (Array.isArray(entry.author)) { + authors = entry.author.map((author: any) => author.name); + } else if (entry.author) { + authors = [entry.author.name]; + } + + // Extract links + let links: any[] = []; + if (Array.isArray(entry.link)) { + links = entry.link; + } else if (entry.link) { + links = [entry.link]; + } + + // Extract PDF link + let pdfUrl = id.replace("/abs/", "/pdf/") + ".pdf"; + const pdfLinkObj = links.find((link: any) => link["@_title"] === "pdf"); + if (pdfLinkObj && pdfLinkObj["@_href"]) { + pdfUrl = pdfLinkObj["@_href"]; + } + + return { + id, + title, + summary, + published, + updated, + authors, + pdfUrl, + links, + }; +} From 20cd43cded2f26334949a392433bf79096fb8c05 Mon Sep 17 00:00:00 2001 From: jacoblee93 Date: Mon, 23 Dec 2024 17:54:51 -0800 Subject: [PATCH 11/16] Rename --- .../retrievers/arxiv-retriever.mdx | 2 
+- examples/src/retrievers/arxiv.ts | 4 +-- .../src/retrievers/arxiv.ts | 9 +++--- .../src/retrievers/tests/arxiv.int.test.ts | 32 +++++++++---------- 4 files changed, 23 insertions(+), 24 deletions(-) diff --git a/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx b/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx index fff4da2a0a2d..8f85886c38aa 100644 --- a/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx +++ b/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx @@ -34,7 +34,7 @@ npm install pdf-parse fast-xml-parser ```typescript const retriever = new ArxivRetriever({ - returnFullDocuments: false, // Set to true to fetch full documents (PDFs) + getFullDocuments: false, // Set to true to fetch full documents (PDFs) maxSearchResults: 5, // Maximum number of results to retrieve }); ``` diff --git a/examples/src/retrievers/arxiv.ts b/examples/src/retrievers/arxiv.ts index fe4a4346371c..3e74502e7d49 100644 --- a/examples/src/retrievers/arxiv.ts +++ b/examples/src/retrievers/arxiv.ts @@ -7,7 +7,7 @@ export const run = async () => { const queryId = "1605.08386 2103.03404"; const retrieverById = new ArxivRetriever({ - returnFullDocuments: true, + getFullDocuments: true, maxSearchResults: 5, }); const documentsById = await retrieverById.invoke(queryId); @@ -42,7 +42,7 @@ export const run = async () => { const queryNat = "What is the ImageBind model?"; const retrieverByNat = new ArxivRetriever({ - returnFullDocuments: false, + getFullDocuments: false, maxSearchResults: 2, }); const documentsByQuery = await retrieverByNat.invoke(queryNat); diff --git a/libs/langchain-community/src/retrievers/arxiv.ts b/libs/langchain-community/src/retrievers/arxiv.ts index 2eb0ab6c5675..8009ce9f8320 100644 --- a/libs/langchain-community/src/retrievers/arxiv.ts +++ b/libs/langchain-community/src/retrievers/arxiv.ts @@ -7,7 +7,7 @@ import { } from "../utils/arxiv.js"; export type ArxivRetrieverOptions = { - returnFullDocuments?: 
boolean; + getFullDocuments?: boolean; maxSearchResults?: number; } & BaseRetrieverInput; @@ -22,14 +22,13 @@ export class ArxivRetriever extends BaseRetriever { lc_namespace = ["langchain", "retrievers", "arxiv_retriever"]; - returnFullDocuments = false; + getFullDocuments = false; maxSearchResults = 10; constructor(options: ArxivRetrieverOptions = {}) { super(options); - this.returnFullDocuments = - options.returnFullDocuments ?? this.returnFullDocuments; + this.getFullDocuments = options.getFullDocuments ?? this.getFullDocuments; this.maxSearchResults = options.maxSearchResults ?? this.maxSearchResults; } @@ -37,7 +36,7 @@ export class ArxivRetriever extends BaseRetriever { try { const results = await searchArxiv(query, this.maxSearchResults); - if (this.returnFullDocuments) { + if (this.getFullDocuments) { // Fetch and parse PDFs to get full documents return await loadDocsFromResults(results); } else { diff --git a/libs/langchain-community/src/retrievers/tests/arxiv.int.test.ts b/libs/langchain-community/src/retrievers/tests/arxiv.int.test.ts index 2d8467fe1f57..bb05f11504e5 100644 --- a/libs/langchain-community/src/retrievers/tests/arxiv.int.test.ts +++ b/libs/langchain-community/src/retrievers/tests/arxiv.int.test.ts @@ -4,7 +4,7 @@ import { ArxivRetriever } from "../arxiv.js"; test("ArxivRetriever fetching document summaries test", async () => { // Sample integration test for ArxivRetriever using the "machine learning" query const retriever = new ArxivRetriever({ - returnFullDocuments: false, + getFullDocuments: false, maxSearchResults: 5, }); const query = "machine learning"; @@ -43,7 +43,7 @@ test("ArxivRetriever fetching document summaries test", async () => { test("ArxivRetriever fetching document summaries with invalid query test", async () => { // Sample test for ArxivRetriever using an invalid query const retriever = new ArxivRetriever({ - returnFullDocuments: false, + getFullDocuments: false, maxSearchResults: 5, }); const query = "fjalsdkjfw"; @@ 
-56,7 +56,7 @@ test("ArxivRetriever fetching document summaries with invalid query test", async test("ArxivRetriever fetching document summaries with empty query test", async () => { // Sample test for ArxivRetriever using an empty query const retriever = new ArxivRetriever({ - returnFullDocuments: false, + getFullDocuments: false, maxSearchResults: 5, }); const query = ""; @@ -70,7 +70,7 @@ test("ArxivRetriever fetching document summaries with invalid maxSearchResults t // Sample test for ArxivRetriever using an invalid maxSearchResults try { const retriever = new ArxivRetriever({ - returnFullDocuments: true, + getFullDocuments: true, maxSearchResults: -1, }); const query = "machine learning"; @@ -87,7 +87,7 @@ test("ArxivRetriever fetching document summaries with zero maxSearchResults test // Sample test for ArxivRetriever using an zero maxSearchResults try { const retriever = new ArxivRetriever({ - returnFullDocuments: true, + getFullDocuments: true, maxSearchResults: 0, }); const query = "machine learning"; @@ -103,7 +103,7 @@ test("ArxivRetriever fetching document summaries with zero maxSearchResults test test("ArxivRetriever fetching full documents test", async () => { // Sample test for fetching full documents with ArxivRetriever const retriever = new ArxivRetriever({ - returnFullDocuments: true, + getFullDocuments: true, maxSearchResults: 5, }); const query = "machine learning"; @@ -145,7 +145,7 @@ test("ArxivRetriever fetching full documents test", async () => { test("ArxivRetriever fetching full documents with invalid query test", async () => { // Sample test for fetching full documents with ArxivRetriever using an invalid query const retriever = new ArxivRetriever({ - returnFullDocuments: true, + getFullDocuments: true, maxSearchResults: 5, }); const query = "fjalsdkjfw"; @@ -158,7 +158,7 @@ test("ArxivRetriever fetching full documents with invalid query test", async () test("ArxivRetriever fetching full documents with empty query test", async () => { // 
Sample test for fetching full documents with ArxivRetriever using an empty query const retriever = new ArxivRetriever({ - returnFullDocuments: true, + getFullDocuments: true, maxSearchResults: 5, }); const query = ""; @@ -172,7 +172,7 @@ test("ArxivRetriever fetching full documents with invalid maxSearchResults test" // Sample test for fetching full documents with ArxivRetriever using an invalid maxSearchResults try { const retriever = new ArxivRetriever({ - returnFullDocuments: true, + getFullDocuments: true, maxSearchResults: -1, }); const query = "machine learning"; @@ -189,7 +189,7 @@ test("ArxivRetriever fetching full documents with zero maxSearchResults", async // Sample test for fetching full documents with ArxivRetriever using an zero maxSearchResults try { const retriever = new ArxivRetriever({ - returnFullDocuments: true, + getFullDocuments: true, maxSearchResults: 0, }); const query = "machine learning"; @@ -206,7 +206,7 @@ test("ArxivRetriever search articles by id test", async () => { // Sample test for fetching articles by arXiv IDs const fetchIds = "2103.03404 2103.03405"; const retriever = new ArxivRetriever({ - returnFullDocuments: false, + getFullDocuments: false, maxSearchResults: 5, }); const results = await retriever.invoke(fetchIds); @@ -244,7 +244,7 @@ test("ArxivRetriever search articles by id with invalid id test", async () => { // Sample test for fetching articles by arXiv IDs with an invalid ID const fetchIds = "2103.03404 2103.03405 1234.56789"; const retriever = new ArxivRetriever({ - returnFullDocuments: false, + getFullDocuments: false, maxSearchResults: 5, }); const results = await retriever.invoke(fetchIds); @@ -257,7 +257,7 @@ test("ArxivRetriever search articles by id with empty id test", async () => { // Sample test for fetching articles by arXiv IDs with an empty ID const fetchIds = ""; const retriever = new ArxivRetriever({ - returnFullDocuments: false, + getFullDocuments: false, maxSearchResults: 5, }); const results = await 
retriever.invoke(fetchIds); @@ -271,7 +271,7 @@ test("ArxivRetriever search articles by id with invalid maxSearchResults test", try { const fetchIds = "2103.03404 2103.03405"; const retriever = new ArxivRetriever({ - returnFullDocuments: false, + getFullDocuments: false, maxSearchResults: -1, }); const results = await retriever.invoke(fetchIds); @@ -288,7 +288,7 @@ test("ArxivRetriever search articles by id with invalid id and maxSearchResults try { const fetchIds = "2103.03404 2103.03405 1234.56789"; const retriever = new ArxivRetriever({ - returnFullDocuments: false, + getFullDocuments: false, maxSearchResults: -1, }); const results = await retriever.invoke(fetchIds); @@ -305,7 +305,7 @@ test("ArxivRetriever search articles by id with invalid id and zero maxSearchRes try { const fetchIds = "2103.03404 2103.03405 1234.56789"; const retriever = new ArxivRetriever({ - returnFullDocuments: false, + getFullDocuments: false, maxSearchResults: 0, }); const results = await retriever.invoke(fetchIds); From a630c44f8b3504e3c81b33d591eb718c2ab12240 Mon Sep 17 00:00:00 2001 From: jacoblee93 Date: Mon, 23 Dec 2024 17:56:42 -0800 Subject: [PATCH 12/16] Fix --- .../docs/integrations/retrievers/arxiv-retriever.mdx | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx b/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx index 8f85886c38aa..2cdefddbbb1c 100644 --- a/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx +++ b/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx @@ -19,7 +19,7 @@ The `arXiv Retriever` allows users to query the arXiv database for academic arti | ---------------- | ---------------------------- | ---------------------------------------------------------------------------- | | `ArxivRetriever` | Academic articles from arXiv | [`@langchain/community`](https://www.npmjs.com/package/@langchain/community) | -## Setup/Installation +## Setup 
Ensure the following dependencies are installed: @@ -30,7 +30,7 @@ Ensure the following dependencies are installed: npm install pdf-parse fast-xml-parser ``` -## Instantiate the retriever +## Instantiation ```typescript const retriever = new ArxivRetriever({ @@ -96,4 +96,6 @@ const ragChain = RunnableSequence.from([ await ragChain.invoke("What are the latest advances in quantum computing?"); ``` +## API reference + For detailed documentation of all ArxivRetriever features and configurations, head to the [API reference](https://api.js.langchain.com/classes/_langchain_community.retrievers_arxiv.ArxivRetriever.html) From 7c4f09f9a0aa7988f198ebf32a369d205bed8edb Mon Sep 17 00:00:00 2001 From: jacoblee93 Date: Mon, 23 Dec 2024 18:12:28 -0800 Subject: [PATCH 13/16] Add optional dep --- libs/langchain-community/package.json | 5 +++++ yarn.lock | 15 +++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/libs/langchain-community/package.json b/libs/langchain-community/package.json index 447a3eda9484..1a45528ec5b6 100644 --- a/libs/langchain-community/package.json +++ b/libs/langchain-community/package.json @@ -166,6 +166,7 @@ "eslint-plugin-no-instanceof": "^1.0.1", "eslint-plugin-prettier": "^4.2.1", "faiss-node": "^0.5.1", + "fast-xml-parser": "^4.5.1", "firebase-admin": "^11.9.0 || ^12.0.0", "google-auth-library": "^9.10.0", "googleapis": "^126.0.1", @@ -302,6 +303,7 @@ "duck-duck-scrape": "^2.2.5", "epub2": "^3.0.1", "faiss-node": "^0.5.1", + "fast-xml-parser": "*", "firebase-admin": "^11.9.0 || ^12.0.0", "google-auth-library": "*", "googleapis": "*", @@ -584,6 +586,9 @@ "faiss-node": { "optional": true }, + "fast-xml-parser": { + "optional": true + }, "firebase-admin": { "optional": true }, diff --git a/yarn.lock b/yarn.lock index 9e5a48455320..abae3190907d 100644 --- a/yarn.lock +++ b/yarn.lock @@ -11908,6 +11908,7 @@ __metadata: eslint-plugin-prettier: ^4.2.1 expr-eval: ^2.0.2 faiss-node: ^0.5.1 + fast-xml-parser: ^4.5.1 firebase-admin: ^11.9.0 || ^12.0.0 
flat: ^5.0.2 google-auth-library: ^9.10.0 @@ -12050,6 +12051,7 @@ __metadata: duck-duck-scrape: ^2.2.5 epub2: ^3.0.1 faiss-node: ^0.5.1 + fast-xml-parser: "*" firebase-admin: ^11.9.0 || ^12.0.0 google-auth-library: "*" googleapis: "*" @@ -12252,6 +12254,8 @@ __metadata: optional: true faiss-node: optional: true + fast-xml-parser: + optional: true firebase-admin: optional: true google-auth-library: @@ -28227,6 +28231,17 @@ __metadata: languageName: node linkType: hard +"fast-xml-parser@npm:^4.5.1": + version: 4.5.1 + resolution: "fast-xml-parser@npm:4.5.1" + dependencies: + strnum: ^1.0.5 + bin: + fxparser: src/cli/cli.js + checksum: aab32d7f08a95b20f9ecdc2d769531a9dc454faf12740873972f8169c04ab9335ac5df1029ebfe829a01ddbb0ec60572cb7769d6be2409e95a9be8fc6a86e92c + languageName: node + linkType: hard + "fastq@npm:^1.6.0": version: 1.15.0 resolution: "fastq@npm:1.15.0" From b640c3917aa9cc54810ba75f76006f9e6995fa51 Mon Sep 17 00:00:00 2001 From: jacoblee93 Date: Mon, 23 Dec 2024 18:14:05 -0800 Subject: [PATCH 14/16] Lint --- libs/langchain-community/src/utils/arxiv.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/libs/langchain-community/src/utils/arxiv.ts b/libs/langchain-community/src/utils/arxiv.ts index 9052fad0eb99..6a79b78a776a 100644 --- a/libs/langchain-community/src/utils/arxiv.ts +++ b/libs/langchain-community/src/utils/arxiv.ts @@ -1,3 +1,4 @@ +/* eslint-disable import/no-extraneous-dependencies */ import { Document } from "@langchain/core/documents"; import { XMLParser } from "fast-xml-parser"; From 7e49ac24f07941d961c80109bd8284597f0c2f0d Mon Sep 17 00:00:00 2001 From: jacoblee93 Date: Mon, 23 Dec 2024 18:14:39 -0800 Subject: [PATCH 15/16] Fix --- docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx b/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx index 2cdefddbbb1c..cb4ad949dd1a 100644 --- 
a/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx +++ b/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx @@ -1,7 +1,5 @@ # ArxivRetriever ---- - ## Overview The `arXiv Retriever` allows users to query the arXiv database for academic articles. It supports both full-document retrieval (PDF parsing) and summary-based retrieval. For detailed documentation of all ArxivRetriever features and configurations, head to [API reference](#https://arxiv.org/) From 25a96f6f770fef374c56c02058f05c84ae8247fd Mon Sep 17 00:00:00 2001 From: jacoblee93 Date: Mon, 23 Dec 2024 18:39:22 -0800 Subject: [PATCH 16/16] Fix docs --- .../docs/integrations/retrievers/arxiv-retriever.mdx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx b/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx index cb4ad949dd1a..254c90ca49fe 100644 --- a/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx +++ b/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx @@ -1,8 +1,8 @@ # ArxivRetriever -## Overview +The `arXiv Retriever` allows users to query the arXiv database for academic articles. It supports both full-document retrieval (PDF parsing) and summary-based retrieval. -The `arXiv Retriever` allows users to query the arXiv database for academic articles. It supports both full-document retrieval (PDF parsing) and summary-based retrieval. For detailed documentation of all ArxivRetriever features and configurations, head to [API reference](#https://arxiv.org/) +For detailed documentation of all ArxivRetriever features and configurations, head to the [API reference](https://api.js.langchain.com/classes/_langchain_community.retrievers_arxiv.ArxivRetriever.html) ## Features