community: support Nomic embeddings #17138

Closed. Wants to merge 4 commits.
Changes from 1 commit
1 change: 1 addition & 0 deletions libs/community/langchain_community/embeddings/__init__.py
@@ -11,43 +11,44 @@
"""


import logging
from typing import Any

from langchain_community.embeddings.aleph_alpha import (
AlephAlphaAsymmetricSemanticEmbedding,
AlephAlphaSymmetricSemanticEmbedding,
)
from langchain_community.embeddings.awa import AwaEmbeddings
from langchain_community.embeddings.azure_openai import AzureOpenAIEmbeddings
from langchain_community.embeddings.baichuan import BaichuanTextEmbeddings
from langchain_community.embeddings.baidu_qianfan_endpoint import (
QianfanEmbeddingsEndpoint,
)
from langchain_community.embeddings.bedrock import BedrockEmbeddings
from langchain_community.embeddings.bookend import BookendEmbeddings
from langchain_community.embeddings.clarifai import ClarifaiEmbeddings
from langchain_community.embeddings.cohere import CohereEmbeddings
from langchain_community.embeddings.dashscope import DashScopeEmbeddings
from langchain_community.embeddings.databricks import DatabricksEmbeddings
from langchain_community.embeddings.deepinfra import DeepInfraEmbeddings
from langchain_community.embeddings.edenai import EdenAiEmbeddings
from langchain_community.embeddings.elasticsearch import ElasticsearchEmbeddings
from langchain_community.embeddings.embaas import EmbaasEmbeddings
from langchain_community.embeddings.ernie import ErnieEmbeddings
from langchain_community.embeddings.fake import (
DeterministicFakeEmbedding,
FakeEmbeddings,
)
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_community.embeddings.google_palm import GooglePalmEmbeddings
from langchain_community.embeddings.gpt4all import GPT4AllEmbeddings
from langchain_community.embeddings.gradient_ai import GradientEmbeddings
from langchain_community.embeddings.huggingface import (
HuggingFaceBgeEmbeddings,
HuggingFaceEmbeddings,
HuggingFaceInferenceAPIEmbeddings,
HuggingFaceInstructEmbeddings,
HuggingFaceNomicEmbeddings

[Check failure on line 51 in libs/community/langchain_community/embeddings/__init__.py: GitHub Actions / cd libs/community / - / make lint #3.11, Ruff (F401)]
langchain_community/embeddings/__init__.py:51:5: F401 `langchain_community.embeddings.huggingface.HuggingFaceNomicEmbeddings` imported but unused
)
from langchain_community.embeddings.huggingface_hub import HuggingFaceHubEmbeddings
from langchain_community.embeddings.infinity import InfinityEmbeddings
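Context for the Ruff F401 failure above: in this package's `__init__.py`, an imported embedding class is normally also listed in the module's `__all__` export list, otherwise the import is flagged as unused. A minimal sketch of that follow-up change, under the assumption that the existing `__all__` structure is as suggested by the imports shown here (the neighboring entries are illustrative, only the new entry is specific to this PR):

    # libs/community/langchain_community/embeddings/__init__.py (sketch of the F401 fix)
    __all__ = [
        # ... existing exports ...
        "HuggingFaceEmbeddings",
        "HuggingFaceInstructEmbeddings",
        "HuggingFaceBgeEmbeddings",
        "HuggingFaceNomicEmbeddings",  # new export; makes the added import "used"
        # ... remaining exports ...
    ]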
93 changes: 92 additions & 1 deletion libs/community/langchain_community/embeddings/huggingface.py
@@ -7,6 +7,7 @@
DEFAULT_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
DEFAULT_INSTRUCT_MODEL = "hkunlp/instructor-large"
DEFAULT_BGE_MODEL = "BAAI/bge-large-en"
DEFAULT_NOMIC_MODEL = "nomic-ai/nomic-embed-text-v1"
DEFAULT_EMBED_INSTRUCTION = "Represent the document for retrieval: "
DEFAULT_QUERY_INSTRUCTION = (
"Represent the question for retrieving supporting documents: "
@@ -15,7 +16,8 @@
"Represent this question for searching relevant passages: "
)
DEFAULT_QUERY_BGE_INSTRUCTION_ZH = "为这个句子生成表示以用于检索相关文章:"

DEFAULT_QUERY_NOMIC_INSTRUCTION = "search_query:"
DEFAULT_EMBED_NOMIC_INSTRUCTION = "search_document:"

class HuggingFaceEmbeddings(BaseModel, Embeddings):
"""HuggingFace sentence_transformers embedding models.
@@ -345,3 +347,92 @@
Embeddings for the text.
"""
return self.embed_documents([text])[0]



class HuggingFaceNomicEmbeddings(BaseModel, Embeddings):
"""HuggingFace Nomic sentence_transformers embedding models.

To use this, you should have the sentence_transformers Python package installed, and the version needs to be >= 2.3.0.

[Check failure on line 356 in libs/community/langchain_community/embeddings/huggingface.py: GitHub Actions / cd libs/community / - / make lint #3.11, Ruff (E501)]
langchain_community/embeddings/huggingface.py:356:89: E501 Line too long (121 > 88)

Example:
.. code-block:: python

from langchain_community.embeddings import HuggingFaceNomicEmbeddings

model_name = "nomic-ai/nomic-embed-text-v1"
model_kwargs = {
'device': 'cpu',
'trust_remote_code': True
}
encode_kwargs = {'normalize_embeddings': True}
hf = HuggingFaceNomicEmbeddings(
model_name=model_name,
model_kwargs=model_kwargs,
encode_kwargs=encode_kwargs
)
"""

client: Any #: :meta private:
model_name: str = DEFAULT_NOMIC_MODEL
"""Model name to use."""
cache_folder: Optional[str] = None
"""Path to store models.
Can be also set by SENTENCE_TRANSFORMERS_HOME environment variable."""
model_kwargs: Dict[str, Any] = Field(default_factory=dict)
"""Keyword arguments to pass to the model."""
encode_kwargs: Dict[str, Any] = Field(default_factory=dict)
"""Keyword arguments to pass when calling the `encode` method of the model."""
query_instruction: str = DEFAULT_QUERY_NOMIC_INSTRUCTION
"""Instruction to use for embedding query."""
embed_instruction: str = DEFAULT_EMBED_NOMIC_INSTRUCTION
"""Instruction to use for embedding documents."""

def __init__(self, **kwargs: Any):
"""Initialize the sentence_transformer."""
super().__init__(**kwargs)
try:
import sentence_transformers

except ImportError as exc:
raise ImportError(
"Could not import sentence_transformers python package. "
"Please install it with `pip install sentence_transformers`."
) from exc

self.client = sentence_transformers.SentenceTransformer(
self.model_name, cache_folder=self.cache_folder, **self.model_kwargs
)


class Config:
"""Configuration for this pydantic object."""

extra = Extra.forbid

def embed_documents(self, texts: List[str]) -> List[List[float]]:
"""Compute doc embeddings using a HuggingFace transformer model.

Args:
texts: The list of texts to embed.

Returns:
List of embeddings, one for each text.
"""
texts = [self.embed_instruction + t.replace("\n", " ") for t in texts]
embeddings = self.client.encode(texts, **self.encode_kwargs)
return embeddings.tolist()

def embed_query(self, text: str) -> List[float]:
"""Compute query embeddings using a HuggingFace transformer model.

Args:
text: The text to embed.

Returns:
Embeddings for the text.
"""
text = text.replace("\n", " ")
embedding = self.client.encode(
self.query_instruction + text, **self.encode_kwargs
)
return embedding.tolist()
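For illustration, a short usage sketch of the proposed class, assembled from the docstring example and the two methods above. It assumes this PR's code is installed and that `sentence_transformers>=2.3.0` is available; the sample texts are placeholders:

    from langchain_community.embeddings import HuggingFaceNomicEmbeddings

    embedder = HuggingFaceNomicEmbeddings(
        model_name="nomic-ai/nomic-embed-text-v1",
        model_kwargs={"device": "cpu", "trust_remote_code": True},
        encode_kwargs={"normalize_embeddings": True},
    )

    # The class itself prepends "search_document:" to documents and "search_query:"
    # to queries, so callers pass raw text.
    doc_vectors = embedder.embed_documents(
        ["Nomic Embed is a long-context text embedding model."]
    )
    query_vector = embedder.embed_query("Which model produces long-context embeddings?")
    print(len(doc_vectors), len(doc_vectors[0]), len(query_vector))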
1 change: 1 addition & 0 deletions libs/community/tests/unit_tests/embeddings/test_imports.py
@@ -58,6 +58,7 @@
"BookendEmbeddings",
"VolcanoEmbeddings",
"OCIGenAIEmbeddings",
"HuggingFaceNomicEmbeddings",
]
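This test file keeps an expected-exports list in sync with the package's public surface, which is why the `__all__` addition sketched earlier has to accompany this entry. Conceptually the check looks roughly like the sketch below; the test name and assertion here are illustrative, not the file's actual code:

    # Illustrative sketch only; test_imports.py's real assertion may differ in detail.
    from langchain_community import embeddings

    def test_nomic_embeddings_exported() -> None:
        # The new class must appear in the package's public exports for the
        # expected-all style check in this file to pass.
        assert "HuggingFaceNomicEmbeddings" in embeddings.__all__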

