diff --git a/eland/ml/pytorch/transformers.py b/eland/ml/pytorch/transformers.py
index fb1bb696..40e5650b 100644
--- a/eland/ml/pytorch/transformers.py
+++ b/eland/ml/pytorch/transformers.py
@@ -570,7 +570,7 @@ class _TraceableTextEmbeddingModel(_TransformerTraceableModel):
     def _prepare_inputs(self) -> transformers.BatchEncoding:
         return self._tokenizer(
             "This is an example sentence.",
-            padding="max_length",
+            padding="longest",
             return_tensors="pt",
         )

@@ -759,7 +759,7 @@ def _find_max_sequence_length(self) -> int:
         # a random or very large value.
         REASONABLE_MAX_LENGTH = 8192
         max_len = getattr(self._tokenizer, "model_max_length", None)
-        if max_len is not None and max_len < REASONABLE_MAX_LENGTH:
+        if max_len is not None and max_len <= REASONABLE_MAX_LENGTH:
             return int(max_len)

         max_sizes = getattr(self._tokenizer, "max_model_input_sizes", dict())
diff --git a/setup.py b/setup.py
index 2ad02ffb..4ac65d1d 100644
--- a/setup.py
+++ b/setup.py
@@ -65,7 +65,7 @@
         "sentence-transformers>=2.1.0,<=2.7.0",
         # sentencepiece is a required dependency for the slow tokenizers
         # https://huggingface.co/transformers/v4.4.2/migration.html#sentencepiece-is-removed-from-the-required-dependencies
-        "transformers[sentencepiece]>=4.31.0,<4.44.0",
+        "transformers[sentencepiece]>=4.47.0",
     ],
 }
 extras["all"] = list({dep for deps in extras.values() for dep in deps})
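
The first hunk changes the example input used when tracing text embedding models. A minimal sketch of the behavioral difference, assuming a standard Hugging Face tokenizer (the `bert-base-uncased` checkpoint is an illustrative choice, not one named in this diff):

```python
# Illustration only: contrast the two padding strategies on the same
# example sentence that _prepare_inputs uses.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # assumed model

# "max_length" pads every sequence out to tokenizer.model_max_length
# (512 for this checkpoint), so tracing always sees maximum-width inputs.
fixed = tokenizer(
    "This is an example sentence.", padding="max_length", return_tensors="pt"
)

# "longest" pads only to the longest sequence in the batch, keeping the
# trace inputs realistically sized.
dynamic = tokenizer(
    "This is an example sentence.", padding="longest", return_tensors="pt"
)

print(fixed["input_ids"].shape)    # torch.Size([1, 512])
print(dynamic["input_ids"].shape)  # torch.Size([1, 8])
```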
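
The second hunk is an off-by-one fix at the boundary: a tokenizer reporting a `model_max_length` of exactly 8192 was rejected by the strict `<` comparison and fell through to the `max_model_input_sizes` lookup. A stripped-down, hypothetical restatement of that logic (the `find_max_sequence_length` function below is illustrative, not eland's actual method):

```python
REASONABLE_MAX_LENGTH = 8192

def find_max_sequence_length(model_max_length):
    # Accept the tokenizer-reported limit when it is a sane value;
    # "<=" (the fix) keeps 8192 itself, where "<" discarded it.
    if model_max_length is not None and model_max_length <= REASONABLE_MAX_LENGTH:
        return int(model_max_length)
    return None  # the real method falls back to max_model_input_sizes here

assert find_max_sequence_length(512) == 512
assert find_max_sequence_length(8192) == 8192       # rejected before the fix
assert find_max_sequence_length(int(1e30)) is None  # huge sentinel still rejected
```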