-
Notifications
You must be signed in to change notification settings - Fork 2.3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(community): Port ArxivRetriever to LangChainJS (#7250)
Co-authored-by: Antonio Ferreras <[email protected]> Co-authored-by: Dhruvin Patel <[email protected]> Co-authored-by: Yiran Gogo Yu <[email protected]> Co-authored-by: Jacob Lee <[email protected]>
- Loading branch information
1 parent
a7dd5d2
commit be3fc04
Showing
10 changed files
with
814 additions
and
0 deletions.
There are no files selected for viewing
99 changes: 99 additions & 0 deletions
99
docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
# ArxivRetriever | ||
|
||
The `ArxivRetriever` allows users to query the arXiv database for academic articles. It supports both full-document retrieval (PDF parsing) and summary-based retrieval. | ||
|
||
For detailed documentation of all ArxivRetriever features and configurations, head to the [API reference](https://api.js.langchain.com/classes/_langchain_community.retrievers_arxiv.ArxivRetriever.html). | ||
|
||
## Features | ||
|
||
- Query Flexibility: Search using natural language queries or specific arXiv IDs. | ||
- Full-Document Retrieval: Option to fetch and parse PDFs. | ||
- Summaries as Documents: Retrieve summaries for faster results. | ||
- Customizable Options: Configure maximum results and output format. | ||
|
||
## Integration details | ||
|
||
| Retriever | Source | Package | | ||
| ---------------- | ---------------------------- | ---------------------------------------------------------------------------- | | ||
| `ArxivRetriever` | Academic articles from arXiv | [`@langchain/community`](https://www.npmjs.com/package/@langchain/community) | | ||
|
||
## Setup | ||
|
||
Ensure the following dependencies are installed: | ||
|
||
- `pdf-parse` for parsing PDFs | ||
- `fast-xml-parser` for parsing XML responses from the arXiv API | ||
|
||
```npm2yarn | ||
npm install pdf-parse fast-xml-parser | ||
``` | ||
|
||
## Instantiation | ||
|
||
```typescript | ||
import { ArxivRetriever } from "@langchain/community/retrievers/arxiv"; | ||
|
||
const retriever = new ArxivRetriever({ | ||
getFullDocuments: false, // Set to true to fetch full documents (PDFs) | ||
maxSearchResults: 5, // Maximum number of results to retrieve | ||
}); | ||
``` | ||
|
||
## Usage | ||
|
||
Use the `invoke` method to search arXiv for relevant articles. You can use either natural language queries or specific arXiv IDs. | ||
|
||
```typescript | ||
const query = "quantum computing"; | ||
|
||
const documents = await retriever.invoke(query); | ||
documents.forEach((doc) => { | ||
console.log("Title:", doc.metadata.title); | ||
console.log("Content:", doc.pageContent); // Summary text (parsed PDF content when getFullDocuments is true) | ||
}); | ||
``` | ||
|
||
## Use within a chain | ||
|
||
Like other retrievers, `ArxivRetriever` can be incorporated into LLM applications via chains. Below is an example of using the retriever within a chain: | ||
|
||
```typescript | ||
import { ChatOpenAI } from "@langchain/openai"; | ||
import { ChatPromptTemplate } from "@langchain/core/prompts"; | ||
import { | ||
RunnablePassthrough, | ||
RunnableSequence, | ||
} from "@langchain/core/runnables"; | ||
import { StringOutputParser } from "@langchain/core/output_parsers"; | ||
import type { Document } from "@langchain/core/documents"; | ||
|
||
const llm = new ChatOpenAI({ | ||
model: "gpt-4o-mini", | ||
temperature: 0, | ||
}); | ||
|
||
const prompt = ChatPromptTemplate.fromTemplate(` | ||
Answer the question based only on the context provided. | ||
Context: {context} | ||
Question: {question}`); | ||
|
||
const formatDocs = (docs: Document[]) => { | ||
return docs.map((doc) => doc.pageContent).join("\n\n"); | ||
}; | ||
|
||
const ragChain = RunnableSequence.from([ | ||
{ | ||
context: retriever.pipe(formatDocs), | ||
question: new RunnablePassthrough(), | ||
}, | ||
prompt, | ||
llm, | ||
new StringOutputParser(), | ||
]); | ||
|
||
await ragChain.invoke("What are the latest advances in quantum computing?"); | ||
``` | ||
|
||
## API reference | ||
|
||
For detailed documentation of all ArxivRetriever features and configurations, head to the [API reference](https://api.js.langchain.com/classes/_langchain_community.retrievers_arxiv.ArxivRetriever.html). |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
import { ArxivRetriever } from "@langchain/community/retrievers/arxiv"; | ||
|
||
/**
 * Example entry point showing the two ways `ArxivRetriever` is configured
 * in this file: full-document retrieval for a lookup by arXiv ID, and
 * summary retrieval for a natural-language question. Each result array is
 * printed to the console.
 */
export const run = async () => {
  // --- 1. Lookup by arXiv ID, requesting full texts ---
  const fullTextRetriever = new ArxivRetriever({
    maxSearchResults: 5,
    getFullDocuments: true,
  });
  const arxivIds = "1605.08386 2103.03404";
  const fullTextDocs = await fullTextRetriever.invoke(arxivIds);
  console.log(fullTextDocs);

  // Illustrative shape of the logged array:
  //
  //   [
  //     Document {
  //       pageContent,
  //       metadata: { author, id, published, source, updated, url }
  //     },
  //     Document { pageContent, metadata }
  //   ]

  // --- 2. Natural-language search, requesting summaries only ---
  const summaryRetriever = new ArxivRetriever({
    maxSearchResults: 2,
    getFullDocuments: false,
  });
  const question = "What is the ImageBind model?";
  const summaryDocs = await summaryRetriever.invoke(question);
  console.log(summaryDocs);

  // Illustrative shape of the logged array:
  //
  //   [
  //     Document { pageContent, metadata },
  //     Document { pageContent, metadata }
  //   ]
};
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
import { BaseRetriever, BaseRetrieverInput } from "@langchain/core/retrievers"; | ||
import { Document } from "@langchain/core/documents"; | ||
import { | ||
searchArxiv, | ||
loadDocsFromResults, | ||
getDocsFromSummaries, | ||
} from "../utils/arxiv.js"; | ||
|
||
/**
 * Constructor options accepted by `ArxivRetriever`, combined with the
 * standard `BaseRetrieverInput` fields.
 */
export type ArxivRetrieverOptions = BaseRetrieverInput & {
  /**
   * When `true`, the retriever loads full documents from the search
   * results; otherwise it builds documents from the result summaries.
   */
  getFullDocuments?: boolean;
  /** Result limit forwarded to the arXiv search call. */
  maxSearchResults?: number;
};
|
||
/**
 * A retriever that searches arXiv for articles relevant to a query.
 *
 * Depending on `getFullDocuments`, it returns either full documents
 * (loaded from the search results, i.e. fetched and parsed PDFs) or
 * documents built from the result summaries.
 */
export class ArxivRetriever extends BaseRetriever {
  static lc_name() {
    return "ArxivRetriever";
  }

  lc_namespace = ["langchain", "retrievers", "arxiv_retriever"];

  /** Whether to load full documents instead of summaries. Defaults to `false`. */
  getFullDocuments = false;

  /** Result limit passed to the arXiv search. Defaults to `10`. */
  maxSearchResults = 10;

  /**
   * @param options Optional overrides for `getFullDocuments` and
   *   `maxSearchResults`; any omitted option keeps its field default.
   */
  constructor(options: ArxivRetrieverOptions = {}) {
    super(options);
    this.getFullDocuments = options.getFullDocuments ?? this.getFullDocuments;
    this.maxSearchResults = options.maxSearchResults ?? this.maxSearchResults;
  }

  /**
   * Searches arXiv and converts the results into documents.
   *
   * @param query Search text handed to `searchArxiv` unchanged.
   * @returns Full documents when `getFullDocuments` is set, otherwise
   *   summary-based documents.
   * @throws Error if searching or document loading fails. The message keeps
   *   the original "Error retrieving documents from arXiv." prefix, followed
   *   by the underlying failure message; the original error is attached as
   *   `cause`.
   */
  async _getRelevantDocuments(query: string): Promise<Document[]> {
    try {
      const results = await searchArxiv(query, this.maxSearchResults);

      if (this.getFullDocuments) {
        // Fetch and parse PDFs to get full documents
        return await loadDocsFromResults(results);
      }
      // Use summaries as documents
      return getDocsFromSummaries(results);
    } catch (error) {
      // Surface the root cause instead of discarding it, so callers can tell
      // network, parsing and PDF-loading failures apart. The original prefix
      // is kept verbatim so existing substring matches still work.
      const detail = error instanceof Error ? error.message : String(error);
      throw Object.assign(
        new Error(`Error retrieving documents from arXiv. Cause: ${detail}`),
        { cause: error }
      );
    }
  }
}
Oops, something went wrong.