From 0f0c4f45cfd87f9e4bfbbba77ddb398f32738a25 Mon Sep 17 00:00:00 2001
From: hh-space-invader
Date: Mon, 25 Nov 2024 07:04:52 +0200
Subject: [PATCH] fix: Fix colbert model shape mismatch

---
 fastembed/late_interaction/colbert.py | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/fastembed/late_interaction/colbert.py b/fastembed/late_interaction/colbert.py
index 4d65fc29..83221c19 100644
--- a/fastembed/late_interaction/colbert.py
+++ b/fastembed/late_interaction/colbert.py
@@ -102,5 +102,28 @@ def _tokenize_query(self, query: str) -> list[Encoding]:
         return encoded
 
     def _tokenize_documents(self, documents: list[str]) -> list[Encoding]:
-        encoded = self.tokenizer.encode_batch(documents)
+        """Tokenize documents, leaving one position free for the document marker.
+
+        The truncation limit is lowered by one only for the duration of this
+        call and is restored afterwards, so repeated calls do not keep
+        shrinking it.
+
+        Args:
+            documents: Texts to tokenize.
+
+        Returns:
+            The encodings returned by ``self.tokenizer.encode_batch``.
+        """
+        truncation = self.tokenizer.truncation
+        if truncation is None:
+            # truncation is disabled: there is no length limit to overflow
+            return self.tokenizer.encode_batch(documents)
+        # ensure not to overflow after adding document-marker
+        self.tokenizer.enable_truncation(
+            **{**truncation, "max_length": truncation["max_length"] - 1}
+        )
+        try:
+            encoded = self.tokenizer.encode_batch(documents)
+        finally:
+            # restore the original settings even if encoding fails
+            self.tokenizer.enable_truncation(**truncation)
         return encoded