community: support Nomic embeddings #17138

Closed. Wants to merge 4 commits.
Changes from 1 commit
1 change: 1 addition & 0 deletions libs/community/langchain_community/embeddings/__init__.py
@@ -11,43 +11,44 @@
"""


import logging
from typing import Any

from langchain_community.embeddings.aleph_alpha import (
AlephAlphaAsymmetricSemanticEmbedding,
AlephAlphaSymmetricSemanticEmbedding,
)
from langchain_community.embeddings.awa import AwaEmbeddings
from langchain_community.embeddings.azure_openai import AzureOpenAIEmbeddings
from langchain_community.embeddings.baichuan import BaichuanTextEmbeddings
from langchain_community.embeddings.baidu_qianfan_endpoint import (
QianfanEmbeddingsEndpoint,
)
from langchain_community.embeddings.bedrock import BedrockEmbeddings
from langchain_community.embeddings.bookend import BookendEmbeddings
from langchain_community.embeddings.clarifai import ClarifaiEmbeddings
from langchain_community.embeddings.cohere import CohereEmbeddings
from langchain_community.embeddings.dashscope import DashScopeEmbeddings
from langchain_community.embeddings.databricks import DatabricksEmbeddings
from langchain_community.embeddings.deepinfra import DeepInfraEmbeddings
from langchain_community.embeddings.edenai import EdenAiEmbeddings
from langchain_community.embeddings.elasticsearch import ElasticsearchEmbeddings
from langchain_community.embeddings.embaas import EmbaasEmbeddings
from langchain_community.embeddings.ernie import ErnieEmbeddings
from langchain_community.embeddings.fake import (
DeterministicFakeEmbedding,
FakeEmbeddings,
)
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_community.embeddings.google_palm import GooglePalmEmbeddings
from langchain_community.embeddings.gpt4all import GPT4AllEmbeddings
from langchain_community.embeddings.gradient_ai import GradientEmbeddings
from langchain_community.embeddings.huggingface import (
HuggingFaceBgeEmbeddings,
HuggingFaceEmbeddings,
HuggingFaceInferenceAPIEmbeddings,
HuggingFaceInstructEmbeddings,
HuggingFaceNomicEmbeddings

[Check failure on line 51 in libs/community/langchain_community/embeddings/__init__.py: GitHub Actions / cd libs/community / - / make lint #3.11, Ruff (F401)]
langchain_community/embeddings/__init__.py:51:5: F401 `langchain_community.embeddings.huggingface.HuggingFaceNomicEmbeddings` imported but unused
)
from langchain_community.embeddings.huggingface_hub import HuggingFaceHubEmbeddings
from langchain_community.embeddings.infinity import InfinityEmbeddings
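Context for the Ruff F401 failure above: in this package's `__init__.py`, an imported embedding class is normally also listed in the module's `__all__` export list, otherwise the import is flagged as unused. A minimal sketch of that follow-up change, under the assumption that the existing `__all__` structure is as suggested by the imports shown here (the neighboring entries are illustrative, only the new entry is specific to this PR):

    # libs/community/langchain_community/embeddings/__init__.py (sketch of the F401 fix)
    __all__ = [
        # ... existing exports ...
        "HuggingFaceEmbeddings",
        "HuggingFaceInstructEmbeddings",
        "HuggingFaceBgeEmbeddings",
        "HuggingFaceNomicEmbeddings",  # new export; makes the added import "used"
        # ... remaining exports ...
    ]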
93 changes: 92 additions & 1 deletion libs/community/langchain_community/embeddings/huggingface.py
@@ -7,6 +7,7 @@
DEFAULT_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
DEFAULT_INSTRUCT_MODEL = "hkunlp/instructor-large"
DEFAULT_BGE_MODEL = "BAAI/bge-large-en"
DEFAULT_NOMIC_MODEL = "nomic-ai/nomic-embed-text-v1"
DEFAULT_EMBED_INSTRUCTION = "Represent the document for retrieval: "
DEFAULT_QUERY_INSTRUCTION = (
"Represent the question for retrieving supporting documents: "
@@ -15,7 +16,8 @@
"Represent this question for searching relevant passages: "
)
DEFAULT_QUERY_BGE_INSTRUCTION_ZH = "为这个句子生成表示以用于检索相关文章:"

DEFAULT_QUERY_NOMIC_INSTRUCTION = "search_query:"
DEFAULT_EMBED_NOMIC_INSTRUCTION = "search_document:"

class HuggingFaceEmbeddings(BaseModel, Embeddings):
"""HuggingFace sentence_transformers embedding models.
@@ -345,3 +347,92 @@
Embeddings for the text.
"""
return self.embed_documents([text])[0]



class HuggingFaceNomicEmbeddings(BaseModel, Embeddings):
"""HuggingFace Nomic sentence_transformers embedding models.

To use this, you should have the sentence_transformers Python package installed, and the version needs to be >= 2.3.0.

[Check failure on line 356 in libs/community/langchain_community/embeddings/huggingface.py: GitHub Actions / cd libs/community / - / make lint #3.11, Ruff (E501)]
langchain_community/embeddings/huggingface.py:356:89: E501 Line too long (121 > 88)

Example:
.. code-block:: python

from langchain_community.embeddings import HuggingFaceNomicEmbeddings

model_name = "nomic-ai/nomic-embed-text-v1"
model_kwargs = {
'device': 'cpu',
'trust_remote_code': True
}
encode_kwargs = {'normalize_embeddings': True}
hf = HuggingFaceNomicEmbeddings(
model_name=model_name,
model_kwargs=model_kwargs,
encode_kwargs=encode_kwargs
)
"""

client: Any #: :meta private:
model_name: str = DEFAULT_NOMIC_MODEL
"""Model name to use."""
cache_folder: Optional[str] = None
"""Path to store models.
Can be also set by SENTENCE_TRANSFORMERS_HOME environment variable."""
model_kwargs: Dict[str, Any] = Field(default_factory=dict)
"""Keyword arguments to pass to the model."""
encode_kwargs: Dict[str, Any] = Field(default_factory=dict)
"""Keyword arguments to pass when calling the `encode` method of the model."""
query_instruction: str = DEFAULT_QUERY_NOMIC_INSTRUCTION
"""Instruction to use for embedding query."""
embed_instruction: str = DEFAULT_EMBED_NOMIC_INSTRUCTION
"""Instruction to use for embedding documents."""

def __init__(self, **kwargs: Any):
"""Initialize the sentence_transformer."""
super().__init__(**kwargs)
try:
import sentence_transformers

except ImportError as exc:
raise ImportError(
"Could not import sentence_transformers python package. "
"Please install it with `pip install sentence_transformers`."
) from exc

self.client = sentence_transformers.SentenceTransformer(
self.model_name, cache_folder=self.cache_folder, **self.model_kwargs
)


class Config:
"""Configuration for this pydantic object."""

extra = Extra.forbid

def embed_documents(self, texts: List[str]) -> List[List[float]]:
"""Compute doc embeddings using a HuggingFace transformer model.

Args:
texts: The list of texts to embed.

Returns:
List of embeddings, one for each text.
"""
texts = [self.embed_instruction + t.replace("\n", " ") for t in texts]
embeddings = self.client.encode(texts, **self.encode_kwargs)
return embeddings.tolist()

def embed_query(self, text: str) -> List[float]:
"""Compute query embeddings using a HuggingFace transformer model.

Args:
text: The text to embed.

Returns:
Embeddings for the text.
"""
text = text.replace("\n", " ")
embedding = self.client.encode(
self.query_instruction + text, **self.encode_kwargs
)
return embedding.tolist()
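For illustration, a short usage sketch of the proposed class, assembled from the docstring example and the two methods above. It assumes this PR's code is installed and that `sentence_transformers>=2.3.0` is available; the sample texts are placeholders:

    from langchain_community.embeddings import HuggingFaceNomicEmbeddings

    embedder = HuggingFaceNomicEmbeddings(
        model_name="nomic-ai/nomic-embed-text-v1",
        model_kwargs={"device": "cpu", "trust_remote_code": True},
        encode_kwargs={"normalize_embeddings": True},
    )

    # The class itself prepends "search_document:" to documents and "search_query:"
    # to queries, so callers pass raw text.
    doc_vectors = embedder.embed_documents(
        ["Nomic Embed is a long-context text embedding model."]
    )
    query_vector = embedder.embed_query("Which model produces long-context embeddings?")
    print(len(doc_vectors), len(doc_vectors[0]), len(query_vector))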
1 change: 1 addition & 0 deletions libs/community/tests/unit_tests/embeddings/test_imports.py
@@ -58,6 +58,7 @@
"BookendEmbeddings",
"VolcanoEmbeddings",
"OCIGenAIEmbeddings",
"HuggingFaceNomicEmbeddings",
]
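This test file keeps an expected-exports list in sync with the package's public surface, which is why the `__all__` addition sketched earlier has to accompany this entry. Conceptually the check looks roughly like the sketch below; the test name and assertion here are illustrative, not the file's actual code:

    # Illustrative sketch only; test_imports.py's real assertion may differ in detail.
    from langchain_community import embeddings

    def test_nomic_embeddings_exported() -> None:
        # The new class must appear in the package's public exports for the
        # expected-all style check in this file to pass.
        assert "HuggingFaceNomicEmbeddings" in embeddings.__all__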

