From f0198354d93e7ba8b615b8fd845223c88ea4ed2b Mon Sep 17 00:00:00 2001
From: Massimiliano Pronesti
Date: Wed, 20 Sep 2023 06:50:39 +0200
Subject: [PATCH] fix(embeddings): number of texts in Azure OpenAIEmbeddings
 batch (#10707)

This PR addresses a limitation of Azure OpenAI embeddings, which can
handle at most 16 texts per batch. This can be solved by setting
`chunk_size=16`. However, I'd love to have this automated, so the user
isn't forced to figure out where the issue comes from and how to solve it.

Closes #4575.

@baskaryan

---------

Co-authored-by: Harrison Chase
---
 libs/langchain/langchain/embeddings/openai.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/libs/langchain/langchain/embeddings/openai.py b/libs/langchain/langchain/embeddings/openai.py
index 16e7e0f306a7b..274788a1a8c38 100644
--- a/libs/langchain/langchain/embeddings/openai.py
+++ b/libs/langchain/langchain/embeddings/openai.py
@@ -231,7 +231,7 @@ def build_extra(cls, values: Dict[str, Any]) -> Dict[str, Any]:
         values["model_kwargs"] = extra
         return values
 
-    @root_validator()
+    @root_validator(pre=True)
     def validate_environment(cls, values: Dict) -> Dict:
         """Validate that api key and python package exists in environment."""
         values["openai_api_key"] = get_from_dict_or_env(
@@ -257,8 +257,13 @@ def validate_environment(cls, values: Dict) -> Dict:
         )
         if values["openai_api_type"] in ("azure", "azure_ad", "azuread"):
             default_api_version = "2022-12-01"
+            # Azure OpenAI embedding models allow a maximum of 16 texts
+            # at a time in each batch
+            # See: https://learn.microsoft.com/en-us/azure/ai-services/openai/reference#embeddings
+            default_chunk_size = 16
         else:
             default_api_version = ""
+            default_chunk_size = 1000
         values["openai_api_version"] = get_from_dict_or_env(
             values,
             "openai_api_version",
@@ -271,6 +276,8 @@ def validate_environment(cls, values: Dict) -> Dict:
             "OPENAI_ORGANIZATION",
             default="",
         )
+        if "chunk_size" not in values:
+            values["chunk_size"] = default_chunk_size
         try:
             import openai
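
For context, a minimal sketch of the behavior this patch automates. It is illustrative only: it assumes the remaining Azure settings (API key, base URL, API version, deployment name) are supplied via environment variables, and no constructor arguments beyond `openai_api_type` and `chunk_size` are taken from this diff.

```python
from langchain.embeddings.openai import OpenAIEmbeddings

# Manual workaround before this patch: cap each embedding request at the
# 16-text limit enforced by the Azure OpenAI embeddings endpoint.
azure_embeddings = OpenAIEmbeddings(
    openai_api_type="azure",
    chunk_size=16,
)

# With this patch applied, omitting chunk_size is enough:
# validate_environment defaults it to 16 for the "azure"/"azure_ad"/"azuread"
# API types and to 1000 otherwise.
azure_embeddings = OpenAIEmbeddings(openai_api_type="azure")

vectors = azure_embeddings.embed_documents(["first document", "second document"])
```

Note that the default is only applied when `"chunk_size"` is absent from `values`, so an explicitly passed `chunk_size` still takes precedence over the Azure-specific default.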