Merge branch 'master' into law_2

langchain-ai · Apr 23, 2024 · 5d86755 · 5d86755
2 parents 6469e07 + 1c89e45
commit 5d86755
Show file tree

Hide file tree

Showing 27 changed files with 313 additions and 164 deletions.
diff --git a/docs/api_reference/guide_imports.json b/docs/api_reference/guide_imports.json
diff --git a/docs/docs/guides/productionization/safety/hugging_face_prompt_injection.ipynb b/docs/docs/guides/productionization/safety/hugging_face_prompt_injection.ipynb
@@ -9,7 +9,7 @@
     "\n",
     "This notebook shows how to prevent prompt injection attacks using the text classification model from `HuggingFace`.\n",
     "\n",
-    "By default, it uses a *[laiyer/deberta-v3-base-prompt-injection](https://huggingface.co/laiyer/deberta-v3-base-prompt-injection)* model trained to identify prompt injections. \n",
+    "By default, it uses a *[protectai/deberta-v3-base-prompt-injection-v2](https://huggingface.co/protectai/deberta-v3-base-prompt-injection-v2)* model trained to identify prompt injections. \n",
     "\n",
     "In this notebook, we will use the ONNX version of the model to speed up the inference. "
    ]
@@ -49,11 +49,15 @@
     "from optimum.onnxruntime import ORTModelForSequenceClassification\n",
     "from transformers import AutoTokenizer, pipeline\n",
     "\n",
-    "# Using https://huggingface.co/laiyer/deberta-v3-base-prompt-injection\n",
-    "model_path = \"laiyer/deberta-v3-base-prompt-injection\"\n",
-    "tokenizer = AutoTokenizer.from_pretrained(model_path)\n",
-    "tokenizer.model_input_names = [\"input_ids\", \"attention_mask\"]  # Hack to run the model\n",
-    "model = ORTModelForSequenceClassification.from_pretrained(model_path, subfolder=\"onnx\")\n",
+    "# Using https://huggingface.co/protectai/deberta-v3-base-prompt-injection-v2\n",
+    "model_path = \"laiyer/deberta-v3-base-prompt-injection-v2\"\n",
+    "revision = None  # We recommend specifiying the revision to avoid breaking changes or supply chain attacks\n",
+    "tokenizer = AutoTokenizer.from_pretrained(\n",
+    "    model_path, revision=revision, model_input_names=[\"input_ids\", \"attention_mask\"]\n",
+    ")\n",
+    "model = ORTModelForSequenceClassification.from_pretrained(\n",
+    "    model_path, revision=revision, subfolder=\"onnx\"\n",
+    ")\n",
     "\n",
     "classifier = pipeline(\n",
     "    \"text-classification\",\n",

diff --git a/docs/docs/integrations/vectorstores/neo4jvector.ipynb b/docs/docs/integrations/vectorstores/neo4jvector.ipynb
@@ -8,7 +8,7 @@
     "\n",
     ">[Neo4j](https://neo4j.com/) is an open-source graph database with integrated support for vector similarity search\n",
     "\n",
-    "It supports:\n",
+    "It supports:\n\n",
     "- approximate nearest neighbor search\n",
     "- Euclidean similarity and cosine similarity\n",
     "- Hybrid search combining vector and keyword searches\n",

diff --git a/docs/docs/modules/data_connection/retrievers/custom_retriever.ipynb b/docs/docs/modules/data_connection/retrievers/custom_retriever.ipynb
@@ -98,7 +98,7 @@
     "    ) -> List[Document]:\n",
     "        \"\"\"Sync implementations for retriever.\"\"\"\n",
     "        matching_documents = []\n",
-    "        for document in documents:\n",
+    "        for document in self.documents:\n",
     "            if len(matching_documents) > self.k:\n",
     "                return matching_documents\n",
     "\n",

diff --git a/libs/community/langchain_community/chat_message_histories/file.py b/libs/community/langchain_community/chat_message_histories/file.py
@@ -1,45 +1,5 @@
-import json
-import logging
-from pathlib import Path
-from typing import List
+from langchain_core.chat_history import FileChatMessageHistory
 
-from langchain_core.chat_history import BaseChatMessageHistory
-from langchain_core.messages import (
-    BaseMessage,
-    messages_from_dict,
-    messages_to_dict,
-)
-
-logger = logging.getLogger(__name__)
-
-
-class FileChatMessageHistory(BaseChatMessageHistory):
-    """
-    Chat message history that stores history in a local file.
-
-    Args:
-        file_path: path of the local file to store the messages.
-    """
-
-    def __init__(self, file_path: str):
-        self.file_path = Path(file_path)
-        if not self.file_path.exists():
-            self.file_path.touch()
-            self.file_path.write_text(json.dumps([]))
-
-    @property
-    def messages(self) -> List[BaseMessage]:  # type: ignore
-        """Retrieve the messages from the local file"""
-        items = json.loads(self.file_path.read_text())
-        messages = messages_from_dict(items)
-        return messages
-
-    def add_message(self, message: BaseMessage) -> None:
-        """Append the message to the record in the local file"""
-        messages = messages_to_dict(self.messages)
-        messages.append(messages_to_dict([message])[0])
-        self.file_path.write_text(json.dumps(messages))
-
-    def clear(self) -> None:
-        """Clear session memory from the local file"""
-        self.file_path.write_text(json.dumps([]))
+__all__ = [
+    "FileChatMessageHistory",
+]
diff --git a/libs/community/langchain_community/chat_message_histories/in_memory.py b/libs/community/langchain_community/chat_message_histories/in_memory.py
@@ -1,31 +1,5 @@
-from typing import List, Sequence
+from langchain_core.chat_history import InMemoryChatMessageHistory as ChatMessageHistory
 
-from langchain_core.chat_history import BaseChatMessageHistory
-from langchain_core.messages import BaseMessage
-from langchain_core.pydantic_v1 import BaseModel, Field
-
-
-class ChatMessageHistory(BaseChatMessageHistory, BaseModel):
-    """In memory implementation of chat message history.
-
-    Stores messages in an in memory list.
-    """
-
-    messages: List[BaseMessage] = Field(default_factory=list)
-
-    async def aget_messages(self) -> List[BaseMessage]:
-        return self.messages
-
-    def add_message(self, message: BaseMessage) -> None:
-        """Add a self-created message to the store"""
-        self.messages.append(message)
-
-    async def aadd_messages(self, messages: Sequence[BaseMessage]) -> None:
-        """Add messages to the store"""
-        self.add_messages(messages)
-
-    def clear(self) -> None:
-        self.messages = []
-
-    async def aclear(self) -> None:
-        self.clear()
+__all__ = [
+    "ChatMessageHistory",
+]
diff --git a/libs/community/langchain_community/llms/llamafile.py b/libs/community/langchain_community/llms/llamafile.py
@@ -139,6 +139,7 @@ def _param_fieldnames(self) -> List[str]:
             "streaming",
             "tags",
             "verbose",
+            "custom_get_token_ids",
         ]
         attrs = [
             k for k in get_pydantic_field_names(self.__class__) if k not in ignore_keys

diff --git a/libs/core/langchain_core/caches.py b/libs/core/langchain_core/caches.py
@@ -22,7 +22,7 @@
 from __future__ import annotations
 
 from abc import ABC, abstractmethod
-from typing import Any, Optional, Sequence
+from typing import Any, Dict, Optional, Sequence, Tuple
 
 from langchain_core.outputs import Generation
 from langchain_core.runnables import run_in_executor
@@ -105,3 +105,37 @@ async def aupdate(
     async def aclear(self, **kwargs: Any) -> None:
         """Clear cache that can take additional keyword arguments."""
         return await run_in_executor(None, self.clear, **kwargs)
+
+
+class InMemoryCache(BaseCache):
+    """Cache that stores things in memory."""
+
+    def __init__(self) -> None:
+        """Initialize with empty cache."""
+        self._cache: Dict[Tuple[str, str], RETURN_VAL_TYPE] = {}
+
+    def lookup(self, prompt: str, llm_string: str) -> Optional[RETURN_VAL_TYPE]:
+        """Look up based on prompt and llm_string."""
+        return self._cache.get((prompt, llm_string), None)
+
+    def update(self, prompt: str, llm_string: str, return_val: RETURN_VAL_TYPE) -> None:
+        """Update cache based on prompt and llm_string."""
+        self._cache[(prompt, llm_string)] = return_val
+
+    def clear(self, **kwargs: Any) -> None:
+        """Clear cache."""
+        self._cache = {}
+
+    async def alookup(self, prompt: str, llm_string: str) -> Optional[RETURN_VAL_TYPE]:
+        """Look up based on prompt and llm_string."""
+        return self.lookup(prompt, llm_string)
+
+    async def aupdate(
+        self, prompt: str, llm_string: str, return_val: RETURN_VAL_TYPE
+    ) -> None:
+        """Update cache based on prompt and llm_string."""
+        self.update(prompt, llm_string, return_val)
+
+    async def aclear(self, **kwargs: Any) -> None:
+        """Clear cache."""
+        self.clear()
diff --git a/libs/core/langchain_core/chat_history.py b/libs/core/langchain_core/chat_history.py
@@ -16,15 +16,20 @@
 """  # noqa: E501
 from __future__ import annotations
 
+import json
 from abc import ABC, abstractmethod
+from pathlib import Path
 from typing import List, Sequence, Union
 
 from langchain_core.messages import (
     AIMessage,
     BaseMessage,
     HumanMessage,
     get_buffer_string,
+    messages_from_dict,
+    messages_to_dict,
 )
+from langchain_core.pydantic_v1 import BaseModel, Field
 from langchain_core.runnables import run_in_executor
 
 
@@ -184,3 +189,61 @@ async def aclear(self) -> None:
     def __str__(self) -> str:
         """Return a string representation of the chat history."""
         return get_buffer_string(self.messages)
+
+
+class InMemoryChatMessageHistory(BaseChatMessageHistory, BaseModel):
+    """In memory implementation of chat message history.
+
+    Stores messages in an in memory list.
+    """
+
+    messages: List[BaseMessage] = Field(default_factory=list)
+
+    async def aget_messages(self) -> List[BaseMessage]:
+        return self.messages
+
+    def add_message(self, message: BaseMessage) -> None:
+        """Add a self-created message to the store"""
+        self.messages.append(message)
+
+    async def aadd_messages(self, messages: Sequence[BaseMessage]) -> None:
+        """Add messages to the store"""
+        self.add_messages(messages)
+
+    def clear(self) -> None:
+        self.messages = []
+
+    async def aclear(self) -> None:
+        self.clear()
+
+
+class FileChatMessageHistory(BaseChatMessageHistory):
+    """Chat message history that stores history in a local file."""
+
+    def __init__(self, file_path: str) -> None:
+        """Initialize the file path for the chat history.
+
+        Args:
+            file_path: The path to the local file to store the chat history.
+        """
+        self.file_path = Path(file_path)
+        if not self.file_path.exists():
+            self.file_path.touch()
+            self.file_path.write_text(json.dumps([]))
+
+    @property
+    def messages(self) -> List[BaseMessage]:  # type: ignore
+        """Retrieve the messages from the local file"""
+        items = json.loads(self.file_path.read_text())
+        messages = messages_from_dict(items)
+        return messages
+
+    def add_message(self, message: BaseMessage) -> None:
+        """Append the message to the record in the local file"""
+        messages = messages_to_dict(self.messages)
+        messages.append(messages_to_dict([message])[0])
+        self.file_path.write_text(json.dumps(messages))
+
+    def clear(self) -> None:
+        """Clear session memory from the local file"""
+        self.file_path.write_text(json.dumps([]))
diff --git a/libs/core/langchain_core/language_models/base.py b/libs/core/langchain_core/language_models/base.py
@@ -5,6 +5,7 @@
 from typing import (
     TYPE_CHECKING,
     Any,
+    Callable,
     Dict,
     List,
     Mapping,
@@ -97,6 +98,10 @@ class BaseLanguageModel(
     """Tags to add to the run trace."""
     metadata: Optional[Dict[str, Any]] = Field(default=None, exclude=True)
     """Metadata to add to the run trace."""
+    custom_get_token_ids: Optional[Callable[[str], List[int]]] = Field(
+        default=None, exclude=True
+    )
+    """Optional encoder to use for counting tokens."""
 
     @validator("verbose", pre=True, always=True)
     def set_verbose(cls, verbose: Optional[bool]) -> bool:
@@ -310,7 +315,10 @@ def get_token_ids(self, text: str) -> List[int]:
             A list of ids corresponding to the tokens in the text, in order they occur
                 in the text.
         """
-        return _get_token_ids_default_method(text)
+        if self.custom_get_token_ids is not None:
+            return self.custom_get_token_ids(text)
+        else:
+            return _get_token_ids_default_method(text)
 
     def get_num_tokens(self, text: str) -> int:
         """Get the number of tokens present in the text.