Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

community: support Nomic embeddings #17138

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions libs/community/langchain_community/embeddings/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,84 +11,85 @@
"""


import logging
from typing import Any

from langchain_community.embeddings.aleph_alpha import (
AlephAlphaAsymmetricSemanticEmbedding,
AlephAlphaSymmetricSemanticEmbedding,
)
from langchain_community.embeddings.awa import AwaEmbeddings
from langchain_community.embeddings.azure_openai import AzureOpenAIEmbeddings
from langchain_community.embeddings.baichuan import BaichuanTextEmbeddings
from langchain_community.embeddings.baidu_qianfan_endpoint import (
QianfanEmbeddingsEndpoint,
)
from langchain_community.embeddings.bedrock import BedrockEmbeddings
from langchain_community.embeddings.bookend import BookendEmbeddings
from langchain_community.embeddings.clarifai import ClarifaiEmbeddings
from langchain_community.embeddings.cohere import CohereEmbeddings
from langchain_community.embeddings.dashscope import DashScopeEmbeddings
from langchain_community.embeddings.databricks import DatabricksEmbeddings
from langchain_community.embeddings.deepinfra import DeepInfraEmbeddings
from langchain_community.embeddings.edenai import EdenAiEmbeddings
from langchain_community.embeddings.elasticsearch import ElasticsearchEmbeddings
from langchain_community.embeddings.embaas import EmbaasEmbeddings
from langchain_community.embeddings.ernie import ErnieEmbeddings
from langchain_community.embeddings.fake import (
DeterministicFakeEmbedding,
FakeEmbeddings,
)
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_community.embeddings.google_palm import GooglePalmEmbeddings
from langchain_community.embeddings.gpt4all import GPT4AllEmbeddings
from langchain_community.embeddings.gradient_ai import GradientEmbeddings
from langchain_community.embeddings.huggingface import (
HuggingFaceBgeEmbeddings,
HuggingFaceEmbeddings,
HuggingFaceInferenceAPIEmbeddings,
HuggingFaceInstructEmbeddings,
HuggingFaceNomicEmbeddings
)
from langchain_community.embeddings.huggingface_hub import HuggingFaceHubEmbeddings
from langchain_community.embeddings.infinity import InfinityEmbeddings
from langchain_community.embeddings.javelin_ai_gateway import JavelinAIGatewayEmbeddings
from langchain_community.embeddings.jina import JinaEmbeddings
from langchain_community.embeddings.johnsnowlabs import JohnSnowLabsEmbeddings
from langchain_community.embeddings.llamacpp import LlamaCppEmbeddings
from langchain_community.embeddings.llm_rails import LLMRailsEmbeddings
from langchain_community.embeddings.localai import LocalAIEmbeddings
from langchain_community.embeddings.minimax import MiniMaxEmbeddings
from langchain_community.embeddings.mlflow import (
MlflowCohereEmbeddings,
MlflowEmbeddings,
)
from langchain_community.embeddings.mlflow_gateway import MlflowAIGatewayEmbeddings
from langchain_community.embeddings.modelscope_hub import ModelScopeEmbeddings
from langchain_community.embeddings.mosaicml import MosaicMLInstructorEmbeddings
from langchain_community.embeddings.nlpcloud import NLPCloudEmbeddings
from langchain_community.embeddings.oci_generative_ai import OCIGenAIEmbeddings
from langchain_community.embeddings.octoai_embeddings import OctoAIEmbeddings
from langchain_community.embeddings.ollama import OllamaEmbeddings
from langchain_community.embeddings.openai import OpenAIEmbeddings
from langchain_community.embeddings.sagemaker_endpoint import (
SagemakerEndpointEmbeddings,
)
from langchain_community.embeddings.self_hosted import SelfHostedEmbeddings
from langchain_community.embeddings.self_hosted_hugging_face import (
SelfHostedHuggingFaceEmbeddings,
SelfHostedHuggingFaceInstructEmbeddings,
)
from langchain_community.embeddings.sentence_transformer import (
SentenceTransformerEmbeddings,
)
from langchain_community.embeddings.spacy_embeddings import SpacyEmbeddings
from langchain_community.embeddings.tensorflow_hub import TensorflowHubEmbeddings
from langchain_community.embeddings.vertexai import VertexAIEmbeddings
from langchain_community.embeddings.volcengine import VolcanoEmbeddings
from langchain_community.embeddings.voyageai import VoyageEmbeddings
from langchain_community.embeddings.xinference import XinferenceEmbeddings

logger = logging.getLogger(__name__)

Check failure on line 92 in libs/community/langchain_community/embeddings/__init__.py

View workflow job for this annotation

GitHub Actions / cd libs/community / - / make lint #3.11

Ruff (I001)

langchain_community/embeddings/__init__.py:14:1: I001 Import block is un-sorted or un-formatted

__all__ = [
"OpenAIEmbeddings",
Expand Down Expand Up @@ -139,6 +140,7 @@
"LocalAIEmbeddings",
"AwaEmbeddings",
"HuggingFaceBgeEmbeddings",
'HuggingFaceNomicEmbeddings',
"ErnieEmbeddings",
"JavelinAIGatewayEmbeddings",
"OllamaEmbeddings",
Expand Down
93 changes: 92 additions & 1 deletion libs/community/langchain_community/embeddings/huggingface.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
DEFAULT_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
DEFAULT_INSTRUCT_MODEL = "hkunlp/instructor-large"
DEFAULT_BGE_MODEL = "BAAI/bge-large-en"
DEFAULT_NOMIC_MODEL = 'nomic-ai/nomic-embed-text-v1'
DEFAULT_EMBED_INSTRUCTION = "Represent the document for retrieval: "
DEFAULT_QUERY_INSTRUCTION = (
"Represent the question for retrieving supporting documents: "
Expand All @@ -15,7 +16,8 @@
"Represent this question for searching relevant passages: "
)
DEFAULT_QUERY_BGE_INSTRUCTION_ZH = "为这个句子生成表示以用于检索相关文章:"

DEFAULT_QUERY_NOMIC_INSTRUCTION = 'search_query:'
DEFAULT_EMBED_NOMICI_NSTRUCTION = 'search_document:'

class HuggingFaceEmbeddings(BaseModel, Embeddings):
"""HuggingFace sentence_transformers embedding models.
Expand Down Expand Up @@ -345,3 +347,92 @@
Embeddings for the text.
"""
return self.embed_documents([text])[0]



class HuggingFaceNomicEmbeddings(BaseModel, Embeddings):
"""HuggingFace Nomic sentence_transformers embedding models.

To use this, you should have the sentence_transformers Python package installed, and the version needs to be >= 2.3.0.

Check failure on line 356 in libs/community/langchain_community/embeddings/huggingface.py

View workflow job for this annotation

GitHub Actions / cd libs/community / - / make lint #3.11

Ruff (E501)

langchain_community/embeddings/huggingface.py:356:89: E501 Line too long (121 > 88)

Example:
.. code-block:: python

from langchain_community.embeddings import HuggingFaceNomicEmbeddings

model_name = "nomic-ai/nomic-embed-text-v1"
model_kwargs = {
'device': 'cpu',
'trust_remote_code':True
}
encode_kwargs = {'normalize_embeddings': True}
hf = HuggingFaceNomicEmbeddings(
model_name=model_name,
model_kwargs=model_kwargs,
encode_kwargs=encode_kwargs
)
"""

client: Any #: :meta private:
model_name: str = DEFAULT_NOMIC_MODEL
"""Model name to use."""
cache_folder: Optional[str] = None
"""Path to store models.
Can be also set by SENTENCE_TRANSFORMERS_HOME environment variable."""
model_kwargs: Dict[str, Any] = Field(default_factory=dict)
"""Keyword arguments to pass to the model."""
encode_kwargs: Dict[str, Any] = Field(default_factory=dict)
"""Keyword arguments to pass when calling the `encode` method of the model."""
query_instruction: str = DEFAULT_QUERY_NOMIC_INSTRUCTION
"""Instruction to use for embedding query."""
embed_instruction: str = DEFAULT_EMBED_NOMICI_NSTRUCTION

def __init__(self, **kwargs: Any):
"""Initialize the sentence_transformer."""
super().__init__(**kwargs)
try:
import sentence_transformers

except ImportError as exc:
raise ImportError(
"Could not import sentence_transformers python package. "
"Please install it with `pip install sentence_transformers`."
) from exc

self.client = sentence_transformers.SentenceTransformer(
self.model_name, cache_folder=self.cache_folder, **self.model_kwargs
)


class Config:
"""Configuration for this pydantic object."""

extra = Extra.forbid

def embed_documents(self, texts: List[str]) -> List[List[float]]:
"""Compute doc embeddings using a HuggingFace transformer model.

Args:
texts: The list of texts to embed.

Returns:
List of embeddings, one for each text.
"""
texts = [self.embed_instruction + t.replace("\n", " ") for t in texts]
embeddings = self.client.encode(texts, **self.encode_kwargs)
return embeddings.tolist()

def embed_query(self, text: str) -> List[float]:
"""Compute query embeddings using a HuggingFace transformer model.

Args:
text: The text to embed.

Returns:
Embeddings for the text.
"""
text = text.replace("\n", " ")
embedding = self.client.encode(
self.query_instruction + text, **self.encode_kwargs
)
return embedding.tolist()
1 change: 1 addition & 0 deletions libs/community/tests/unit_tests/embeddings/test_imports.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
"BookendEmbeddings",
"VolcanoEmbeddings",
"OCIGenAIEmbeddings",
"HuggingFaceNomicEmbeddings",
]


Expand Down
Loading