Merge branch 'main' into feature/add-boolean-attribute-support

langchain-ai · Dec 27, 2024 · 356be4e · 356be4e
2 parents b1ead94 + 4549863
commit 356be4e
Show file tree

Hide file tree

Showing 18 changed files with 860 additions and 28 deletions.
diff --git a/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx b/docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx
@@ -0,0 +1,99 @@
+# ArxivRetriever
+
+The `ArxivRetriever` lets you query the arXiv database for academic articles. It supports both full-document retrieval (PDF parsing) and summary-based retrieval.
+
+For detailed documentation of all ArxivRetriever features and configurations, head to the [API reference](https://api.js.langchain.com/classes/_langchain_community.retrievers_arxiv.ArxivRetriever.html).
+
+## Features
+
+- Query Flexibility: Search using natural language queries or specific arXiv IDs.
+- Full-Document Retrieval: Option to fetch and parse PDFs.
+- Summaries as Documents: Retrieve summaries for faster results.
+- Customizable Options: Configure maximum results and output format.
+
+## Integration details
+
+| Retriever        | Source                       | Package                                                                      |
+| ---------------- | ---------------------------- | ---------------------------------------------------------------------------- |
+| `ArxivRetriever` | Academic articles from arXiv | [`@langchain/community`](https://www.npmjs.com/package/@langchain/community) |
+
+## Setup
+
+Ensure the following dependencies are installed:
+
+- `pdf-parse` for parsing PDFs
+- `fast-xml-parser` for parsing XML responses from the arXiv API
+
+```npm2yarn
+npm install pdf-parse fast-xml-parser
+```
+
+## Instantiation
+
+```typescript
+const retriever = new ArxivRetriever({
+  getFullDocuments: false, // Set to true to fetch full documents (PDFs)
+  maxSearchResults: 5, // Maximum number of results to retrieve
+});
+```
+
+## Usage
+
+Use the `invoke` method to search arXiv for relevant articles. You can use either natural language queries or specific arXiv IDs.
+
+```typescript
+const query = "quantum computing";
+
+const documents = await retriever.invoke(query);
+documents.forEach((doc) => {
+  console.log("Title:", doc.metadata.title);
+  console.log("Content:", doc.pageContent); // Summary text (parsed PDF content if getFullDocuments is true)
+});
+```
+
+## Use within a chain
+
+Like other retrievers, `ArxivRetriever` can be incorporated into LLM applications via chains. Below is an example of using the retriever within a chain:
+
+```typescript
+import { ChatOpenAI } from "@langchain/openai";
+import { ChatPromptTemplate } from "@langchain/core/prompts";
+import {
+  RunnablePassthrough,
+  RunnableSequence,
+} from "@langchain/core/runnables";
+import { StringOutputParser } from "@langchain/core/output_parsers";
+import type { Document } from "@langchain/core/documents";
+
+const llm = new ChatOpenAI({
+  model: "gpt-4o-mini",
+  temperature: 0,
+});
+
+const prompt = ChatPromptTemplate.fromTemplate(`
+Answer the question based only on the context provided.
+
+Context: {context}
+
+Question: {question}`);
+
+const formatDocs = (docs: Document[]) => {
+  return docs.map((doc) => doc.pageContent).join("\n\n");
+};
+
+const ragChain = RunnableSequence.from([
+  {
+    context: retriever.pipe(formatDocs),
+    question: new RunnablePassthrough(),
+  },
+  prompt,
+  llm,
+  new StringOutputParser(),
+]);
+
+await ragChain.invoke("What are the latest advances in quantum computing?");
+```
+
+## API reference
+
+For detailed documentation of all ArxivRetriever features and configurations, head to the [API reference](https://api.js.langchain.com/classes/_langchain_community.retrievers_arxiv.ArxivRetriever.html).
diff --git a/examples/src/cache/chat_models/upstash_redis.ts b/examples/src/cache/chat_models/upstash_redis.ts
@@ -7,6 +7,7 @@ const cache = new UpstashRedisCache({
     url: "UPSTASH_REDIS_REST_URL",
     token: "UPSTASH_REDIS_REST_TOKEN",
   },
+  ttl: 3600,
 });
 
 const model = new ChatOpenAI({ cache });
diff --git a/examples/src/cache/upstash_redis.ts b/examples/src/cache/upstash_redis.ts
@@ -7,6 +7,7 @@ const cache = new UpstashRedisCache({
     url: "UPSTASH_REDIS_REST_URL",
     token: "UPSTASH_REDIS_REST_TOKEN",
   },
+  ttl: 3600,
 });
 
 const model = new OpenAI({ cache });
diff --git a/examples/src/retrievers/arxiv.ts b/examples/src/retrievers/arxiv.ts
@@ -0,0 +1,65 @@
+import { ArxivRetriever } from "@langchain/community/retrievers/arxiv";
+
+export const run = async () => {
+  /*
+    Direct lookup by arXiv ID, returning full texts
+  */
+
+  const queryId = "1605.08386 2103.03404";
+  const retrieverById = new ArxivRetriever({
+    getFullDocuments: true,
+    maxSearchResults: 5,
+  });
+  const documentsById = await retrieverById.invoke(queryId);
+  console.log(documentsById);
+
+  /*
+  [
+    Document
+    {
+      pageContent,
+      metadata: 
+      {
+        author,
+        id,
+        published,
+        source,
+        updated,
+        url
+      }
+    },
+    Document
+    {
+      pageContent,
+      metadata
+    }
+  ]
+  */
+
+  /*
+  Search with natural language query, for summaries
+  */
+
+  const queryNat = "What is the ImageBind model?";
+  const retrieverByNat = new ArxivRetriever({
+    getFullDocuments: false,
+    maxSearchResults: 2,
+  });
+  const documentsByQuery = await retrieverByNat.invoke(queryNat);
+  console.log(documentsByQuery);
+
+  /*
+  [
+    Document
+    {
+      pageContent,
+      metadata
+    },
+    Document
+    {
+      pageContent,
+      metadata
+    }
+  ]
+  */
+};
diff --git a/langchain/package.json b/langchain/package.json
@@ -1,6 +1,6 @@
 {
   "name": "langchain",
-  "version": "0.3.7",
+  "version": "0.3.8",
   "description": "Typescript bindings for langchain",
   "type": "module",
   "engines": {

diff --git a/langchain/src/chains/combine_documents/base.ts b/langchain/src/chains/combine_documents/base.ts
@@ -21,6 +21,9 @@ export async function formatDocuments({
   documents: Document[];
   config?: RunnableConfig;
 }) {
+  if (documents == null || documents.length === 0) {
+    return "";
+  }
   const formattedDocs = await Promise.all(
     documents.map((document) =>
       documentPrompt

diff --git a/libs/langchain-cohere/package.json b/libs/langchain-cohere/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@langchain/cohere",
-  "version": "0.3.1",
+  "version": "0.3.2",
   "description": "Cohere integration for LangChain.js",
   "type": "module",
   "engines": {

diff --git a/libs/langchain-cohere/src/rerank.ts b/libs/langchain-cohere/src/rerank.ts
@@ -60,6 +60,9 @@ export class CohereRerank extends BaseDocumentCompressor {
     documents: Array<DocumentInterface>,
     query: string
   ): Promise<Array<DocumentInterface>> {
+    if (documents == null || documents.length === 0) {
+      return [];
+    }
     const _docs = documents.map((doc) => doc.pageContent);
     const { results } = await this.client.rerank({
       model: this.model,

diff --git a/libs/langchain-community/.gitignore b/libs/langchain-community/.gitignore
@@ -626,6 +626,10 @@ retrievers/amazon_knowledge_base.cjs
 retrievers/amazon_knowledge_base.js
 retrievers/amazon_knowledge_base.d.ts
 retrievers/amazon_knowledge_base.d.cts
+retrievers/arxiv.cjs
+retrievers/arxiv.js
+retrievers/arxiv.d.ts
+retrievers/arxiv.d.cts
 retrievers/bm25.cjs
 retrievers/bm25.js
 retrievers/bm25.d.ts

diff --git a/libs/langchain-community/langchain.config.js b/libs/langchain-community/langchain.config.js
@@ -198,6 +198,7 @@ export const config = {
     // retrievers
     "retrievers/amazon_kendra": "retrievers/amazon_kendra",
     "retrievers/amazon_knowledge_base": "retrievers/amazon_knowledge_base",
+    "retrievers/arxiv": "retrievers/arxiv",
     "retrievers/bm25": "retrievers/bm25",
     "retrievers/chaindesk": "retrievers/chaindesk",
     "retrievers/databerry": "retrievers/databerry",
@@ -437,6 +438,7 @@ export const config = {
     "chat_models/zhipuai",
     "retrievers/amazon_kendra",
     "retrievers/amazon_knowledge_base",
+    "retrievers/arxiv",
     "retrievers/dria",
     "retrievers/metal",
     "retrievers/supabase",

diff --git a/libs/langchain-community/package.json b/libs/langchain-community/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@langchain/community",
-  "version": "0.3.19",
+  "version": "0.3.20",
   "description": "Third-party integrations for LangChain.js",
   "type": "module",
   "engines": {
@@ -166,6 +166,7 @@
     "eslint-plugin-no-instanceof": "^1.0.1",
     "eslint-plugin-prettier": "^4.2.1",
     "faiss-node": "^0.5.1",
+    "fast-xml-parser": "^4.5.1",
     "firebase-admin": "^11.9.0 || ^12.0.0",
     "google-auth-library": "^9.10.0",
     "googleapis": "^126.0.1",
@@ -302,6 +303,7 @@
     "duck-duck-scrape": "^2.2.5",
     "epub2": "^3.0.1",
     "faiss-node": "^0.5.1",
+    "fast-xml-parser": "*",
     "firebase-admin": "^11.9.0 || ^12.0.0",
     "google-auth-library": "*",
     "googleapis": "*",
@@ -584,6 +586,9 @@
     "faiss-node": {
       "optional": true
     },
+    "fast-xml-parser": {
+      "optional": true
+    },
     "firebase-admin": {
       "optional": true
     },
@@ -2125,6 +2130,15 @@
       "import": "./retrievers/amazon_knowledge_base.js",
       "require": "./retrievers/amazon_knowledge_base.cjs"
     },
+    "./retrievers/arxiv": {
+      "types": {
+        "import": "./retrievers/arxiv.d.ts",
+        "require": "./retrievers/arxiv.d.cts",
+        "default": "./retrievers/arxiv.d.ts"
+      },
+      "import": "./retrievers/arxiv.js",
+      "require": "./retrievers/arxiv.cjs"
+    },
     "./retrievers/bm25": {
       "types": {
         "import": "./retrievers/bm25.d.ts",
@@ -3774,6 +3788,10 @@
     "retrievers/amazon_knowledge_base.js",
     "retrievers/amazon_knowledge_base.d.ts",
     "retrievers/amazon_knowledge_base.d.cts",
+    "retrievers/arxiv.cjs",
+    "retrievers/arxiv.js",
+    "retrievers/arxiv.d.ts",
+    "retrievers/arxiv.d.cts",
     "retrievers/bm25.cjs",
     "retrievers/bm25.js",
     "retrievers/bm25.d.ts",

diff --git a/libs/langchain-community/src/caches/upstash_redis.ts b/libs/langchain-community/src/caches/upstash_redis.ts
@@ -18,6 +18,10 @@ export type UpstashRedisCacheProps = {
    * An existing Upstash Redis client.
    */
   client?: Redis;
+  /**
+   * Time-to-live (TTL) for cached items, in seconds. If omitted (or 0), entries are stored without an expiry.
+   */
+  ttl?: number;
 };
 
 /**
@@ -30,6 +34,7 @@ export type UpstashRedisCacheProps = {
  *     url: "UPSTASH_REDIS_REST_URL",
  *     token: "UPSTASH_REDIS_REST_TOKEN",
  *   },
+ *   ttl: 3600, // Optional: Cache entries will expire after 1 hour
  * });
  * // Initialize the OpenAI model with Upstash Redis cache for caching responses
  * const model = new ChatOpenAI({
@@ -42,9 +47,12 @@ export type UpstashRedisCacheProps = {
 export class UpstashRedisCache extends BaseCache {
   private redisClient: Redis;
 
+  private ttl?: number;
+
   constructor(props: UpstashRedisCacheProps) {
     super();
-    const { config, client } = props;
+    const { config, client, ttl } = props;
+    this.ttl = ttl;
 
     if (client) {
       this.redisClient = client;
@@ -84,10 +92,13 @@ export class UpstashRedisCache extends BaseCache {
   public async update(prompt: string, llmKey: string, value: Generation[]) {
     for (let i = 0; i < value.length; i += 1) {
       const key = getCacheKey(prompt, llmKey, String(i));
-      await this.redisClient.set(
-        key,
-        JSON.stringify(serializeGeneration(value[i]))
-      );
+      const serializedValue = JSON.stringify(serializeGeneration(value[i]));
+
+      if (this.ttl) {
+        await this.redisClient.set(key, serializedValue, { ex: this.ttl });
+      } else {
+        await this.redisClient.set(key, serializedValue);
+      }
     }
   }
 }