Skip to content

Commit

Permalink
feat(community): Port ArxivRetriever to LangChainJS (#7250)
Browse files Browse the repository at this point in the history
Co-authored-by: Antonio Ferreras <[email protected]>
Co-authored-by: Dhruvin Patel <[email protected]>
Co-authored-by: Yiran Gogo Yu <[email protected]>
Co-authored-by: Jacob Lee <[email protected]>
  • Loading branch information
5 people authored Dec 24, 2024
1 parent a7dd5d2 commit be3fc04
Show file tree
Hide file tree
Showing 10 changed files with 814 additions and 0 deletions.
99 changes: 99 additions & 0 deletions docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
# ArxivRetriever

The `ArxivRetriever` allows users to query the arXiv database for academic articles. It supports both full-document retrieval (PDF parsing) and summary-based retrieval.

For detailed documentation of all ArxivRetriever features and configurations, head to the [API reference](https://api.js.langchain.com/classes/_langchain_community.retrievers_arxiv.ArxivRetriever.html).

## Features

- Query Flexibility: Search using natural language queries or specific arXiv IDs.
- Full-Document Retrieval: Option to fetch and parse PDFs.
- Summaries as Documents: Retrieve summaries for faster results.
- Customizable Options: Configure maximum results and output format.

## Integration details

| Retriever | Source | Package |
| ---------------- | ---------------------------- | ---------------------------------------------------------------------------- |
| `ArxivRetriever` | Academic articles from arXiv | [`@langchain/community`](https://www.npmjs.com/package/@langchain/community) |

## Setup

Ensure the following dependencies are installed:

- `pdf-parse` for parsing PDFs
- `fast-xml-parser` for parsing XML responses from the arXiv API

```npm2yarn
npm install @langchain/community @langchain/core pdf-parse fast-xml-parser
```

## Instantiation

```typescript
import { ArxivRetriever } from "@langchain/community/retrievers/arxiv";

const retriever = new ArxivRetriever({
getFullDocuments: false, // Set to true to fetch full documents (PDFs)
maxSearchResults: 5, // Maximum number of results to retrieve
});
```

## Usage

Use the `invoke` method to search arXiv for relevant articles. You can use either natural language queries or specific arXiv IDs.

```typescript
const query = "quantum computing";

const documents = await retriever.invoke(query);
documents.forEach((doc) => {
console.log("Title:", doc.metadata.title);
console.log("Content:", doc.pageContent); // Summary text (parsed PDF content when getFullDocuments is true)
});
```

## Use within a chain

Like other retrievers, `ArxivRetriever` can be incorporated into LLM applications via chains. Below is an example of using the retriever within a chain:

```typescript
import { ChatOpenAI } from "@langchain/openai";
import { ChatPromptTemplate } from "@langchain/core/prompts";
import {
RunnablePassthrough,
RunnableSequence,
} from "@langchain/core/runnables";
import { StringOutputParser } from "@langchain/core/output_parsers";
import type { Document } from "@langchain/core/documents";

const llm = new ChatOpenAI({
model: "gpt-4o-mini",
temperature: 0,
});

const prompt = ChatPromptTemplate.fromTemplate(`
Answer the question based only on the context provided.
Context: {context}
Question: {question}`);

const formatDocs = (docs: Document[]) => {
return docs.map((doc) => doc.pageContent).join("\n\n");
};

const ragChain = RunnableSequence.from([
{
context: retriever.pipe(formatDocs),
question: new RunnablePassthrough(),
},
prompt,
llm,
new StringOutputParser(),
]);

await ragChain.invoke("What are the latest advances in quantum computing?");
```

## API reference

For detailed documentation of all ArxivRetriever features and configurations, head to the [API reference](https://api.js.langchain.com/classes/_langchain_community.retrievers_arxiv.ArxivRetriever.html).
65 changes: 65 additions & 0 deletions examples/src/retrievers/arxiv.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import { ArxivRetriever } from "@langchain/community/retrievers/arxiv";

/**
 * Example: two ways of querying arXiv with `ArxivRetriever`.
 *
 * 1. Look up papers by arXiv ID and request full documents.
 * 2. Search with a natural language question and request summaries only.
 *
 * Each result set is printed to the console.
 */
export const run = async () => {
  // --- 1. Direct lookup by arXiv ID, returning full texts ---
  const fullTextRetriever = new ArxivRetriever({
    maxSearchResults: 5,
    getFullDocuments: true,
  });
  const fullTextDocs = await fullTextRetriever.invoke("1605.08386 2103.03404");
  console.log(fullTextDocs);
  // Expected shape of the printed value:
  //   [
  //     Document {
  //       pageContent,
  //       metadata: { author, id, published, source, updated, url }
  //     },
  //     Document { pageContent, metadata }
  //   ]

  // --- 2. Natural language search, returning summaries ---
  const summaryRetriever = new ArxivRetriever({
    maxSearchResults: 2,
    getFullDocuments: false,
  });
  const summaryDocs = await summaryRetriever.invoke(
    "What is the ImageBind model?"
  );
  console.log(summaryDocs);
  // Expected shape of the printed value:
  //   [
  //     Document { pageContent, metadata },
  //     Document { pageContent, metadata }
  //   ]
};
4 changes: 4 additions & 0 deletions libs/langchain-community/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -626,6 +626,10 @@ retrievers/amazon_knowledge_base.cjs
retrievers/amazon_knowledge_base.js
retrievers/amazon_knowledge_base.d.ts
retrievers/amazon_knowledge_base.d.cts
retrievers/arxiv.cjs
retrievers/arxiv.js
retrievers/arxiv.d.ts
retrievers/arxiv.d.cts
retrievers/bm25.cjs
retrievers/bm25.js
retrievers/bm25.d.ts
Expand Down
2 changes: 2 additions & 0 deletions libs/langchain-community/langchain.config.js
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,7 @@ export const config = {
// retrievers
"retrievers/amazon_kendra": "retrievers/amazon_kendra",
"retrievers/amazon_knowledge_base": "retrievers/amazon_knowledge_base",
"retrievers/arxiv": "retrievers/arxiv",
"retrievers/bm25": "retrievers/bm25",
"retrievers/chaindesk": "retrievers/chaindesk",
"retrievers/databerry": "retrievers/databerry",
Expand Down Expand Up @@ -437,6 +438,7 @@ export const config = {
"chat_models/zhipuai",
"retrievers/amazon_kendra",
"retrievers/amazon_knowledge_base",
"retrievers/arxiv",
"retrievers/dria",
"retrievers/metal",
"retrievers/supabase",
Expand Down
18 changes: 18 additions & 0 deletions libs/langchain-community/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,7 @@
"eslint-plugin-no-instanceof": "^1.0.1",
"eslint-plugin-prettier": "^4.2.1",
"faiss-node": "^0.5.1",
"fast-xml-parser": "^4.5.1",
"firebase-admin": "^11.9.0 || ^12.0.0",
"google-auth-library": "^9.10.0",
"googleapis": "^126.0.1",
Expand Down Expand Up @@ -302,6 +303,7 @@
"duck-duck-scrape": "^2.2.5",
"epub2": "^3.0.1",
"faiss-node": "^0.5.1",
"fast-xml-parser": "*",
"firebase-admin": "^11.9.0 || ^12.0.0",
"google-auth-library": "*",
"googleapis": "*",
Expand Down Expand Up @@ -584,6 +586,9 @@
"faiss-node": {
"optional": true
},
"fast-xml-parser": {
"optional": true
},
"firebase-admin": {
"optional": true
},
Expand Down Expand Up @@ -2125,6 +2130,15 @@
"import": "./retrievers/amazon_knowledge_base.js",
"require": "./retrievers/amazon_knowledge_base.cjs"
},
"./retrievers/arxiv": {
"types": {
"import": "./retrievers/arxiv.d.ts",
"require": "./retrievers/arxiv.d.cts",
"default": "./retrievers/arxiv.d.ts"
},
"import": "./retrievers/arxiv.js",
"require": "./retrievers/arxiv.cjs"
},
"./retrievers/bm25": {
"types": {
"import": "./retrievers/bm25.d.ts",
Expand Down Expand Up @@ -3774,6 +3788,10 @@
"retrievers/amazon_knowledge_base.js",
"retrievers/amazon_knowledge_base.d.ts",
"retrievers/amazon_knowledge_base.d.cts",
"retrievers/arxiv.cjs",
"retrievers/arxiv.js",
"retrievers/arxiv.d.ts",
"retrievers/arxiv.d.cts",
"retrievers/bm25.cjs",
"retrievers/bm25.js",
"retrievers/bm25.d.ts",
Expand Down
1 change: 1 addition & 0 deletions libs/langchain-community/src/load/import_constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ export const optionalImportEntrypoints: string[] = [
"langchain_community/callbacks/handlers/upstash_ratelimit",
"langchain_community/retrievers/amazon_kendra",
"langchain_community/retrievers/amazon_knowledge_base",
"langchain_community/retrievers/arxiv",
"langchain_community/retrievers/dria",
"langchain_community/retrievers/metal",
"langchain_community/retrievers/supabase",
Expand Down
50 changes: 50 additions & 0 deletions libs/langchain-community/src/retrievers/arxiv.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import { BaseRetriever, BaseRetrieverInput } from "@langchain/core/retrievers";
import { Document } from "@langchain/core/documents";
import {
searchArxiv,
loadDocsFromResults,
getDocsFromSummaries,
} from "../utils/arxiv.js";

/**
 * Constructor options for `ArxivRetriever`, combined with the base
 * retriever input options.
 */
export type ArxivRetrieverOptions = {
/**
 * When true, the retriever fetches and parses PDFs to return full documents;
 * otherwise it uses summaries as documents. The retriever defaults to false.
 */
getFullDocuments?: boolean;
/**
 * Maximum number of results passed to the arXiv search.
 * The retriever defaults to 10.
 */
maxSearchResults?: number;
} & BaseRetrieverInput;

/**
 * A retriever that searches arXiv for relevant articles based on a query.
 * It can retrieve either full documents (PDFs) or just summaries.
 *
 * @example
 * ```typescript
 * const retriever = new ArxivRetriever({
 *   getFullDocuments: false,
 *   maxSearchResults: 5,
 * });
 * const docs = await retriever.invoke("quantum computing");
 * ```
 */
export class ArxivRetriever extends BaseRetriever {
  static lc_name() {
    return "ArxivRetriever";
  }

  // NOTE(review): namespace path declared for the base class; presumably
  // used for serialization — confirm against @langchain/core.
  lc_namespace = ["langchain", "retrievers", "arxiv_retriever"];

  /** Whether to fetch and parse PDFs instead of using summaries. */
  getFullDocuments = false;

  /** Maximum number of search results requested from arXiv. */
  maxSearchResults = 10;

  /**
   * @param options Optional settings; any omitted field keeps its default
   *   (`getFullDocuments: false`, `maxSearchResults: 10`).
   */
  constructor(options: ArxivRetrieverOptions = {}) {
    super(options);
    this.getFullDocuments = options.getFullDocuments ?? this.getFullDocuments;
    this.maxSearchResults = options.maxSearchResults ?? this.maxSearchResults;
  }

  /**
   * Searches arXiv for `query` and converts the results into documents.
   *
   * @param query Search text handed to the arXiv search helper.
   * @returns Full documents parsed from PDFs when `getFullDocuments` is
   *   true, otherwise documents built from the result summaries.
   * @throws Error when searching or loading fails; the message includes the
   *   underlying failure reason so callers can diagnose the cause.
   */
  async _getRelevantDocuments(query: string): Promise<Document[]> {
    try {
      const results = await searchArxiv(query, this.maxSearchResults);

      if (this.getFullDocuments) {
        // Fetch and parse PDFs to get full documents
        return await loadDocsFromResults(results);
      }
      // Use summaries as documents
      return getDocsFromSummaries(results);
    } catch (error) {
      // Surface the underlying reason instead of discarding it. The repo
      // lints against `instanceof`, so read `message` structurally.
      const detail = (error as { message?: unknown } | null | undefined)
        ?.message;
      const reason = typeof detail === "string" ? detail : String(error);
      throw new Error(`Error retrieving documents from arXiv: ${reason}`);
    }
  }
}
Loading

0 comments on commit be3fc04

Please sign in to comment.