Skip to content

Commit

Permalink
x
Browse files Browse the repository at this point in the history
  • Loading branch information
efriis committed Apr 23, 2024
1 parent cef2afc commit 6bc153e
Showing 1 changed file with 8 additions and 3 deletions.
11 changes: 8 additions & 3 deletions libs/partners/mistralai/langchain_mistralai/embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,10 @@ def validate_environment(cls, values: Dict) -> Dict:
)
except IOError: # huggingface_hub GatedRepoError
warnings.warn(
"Using dummy tokenizer, set a Huggingface token via the "
"HF_TOKEN environment variable to use a real tokenizer."
"Could not download mistral tokenizer from Huggingface for "
"calculating batch sizes. Set a Huggingface token via the "
"HF_TOKEN environment variable to download the real tokenizer. "
"Falling back to a dummy tokenizer that uses `len()`."
)
values["tokenizer"] = DummyTokenizer()
return values
Expand All @@ -115,7 +117,10 @@ def _get_batches(self, texts: List[str]) -> Iterable[List[str]]:

for text, text_tokens in zip(texts, text_token_lengths):
if batch_tokens + text_tokens > MAX_TOKENS:
yield batch
if len(batch) > 0:
# Edge case: when the very first text alone exceeds MAX_TOKENS the
# current batch is still empty, so skip the yield rather than emit [].
yield batch
batch = [text]
batch_tokens = text_tokens
else:
Expand Down

0 comments on commit 6bc153e

Please sign in to comment.