From a826399b1158341f18da35ae2f357038b94b81d6 Mon Sep 17 00:00:00 2001
From: CaraJ7 <1350074492@qq.com>
Date: Wed, 28 Aug 2024 09:12:15 +0800
Subject: [PATCH] Support textonly inference for LLaVA-OneVision.

---
 lmms_eval/models/llava_onevision.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/lmms_eval/models/llava_onevision.py b/lmms_eval/models/llava_onevision.py
index 3e768225..064d40d7 100644
--- a/lmms_eval/models/llava_onevision.py
+++ b/lmms_eval/models/llava_onevision.py
@@ -453,6 +453,10 @@ def _collate(x):
             if len(visual) > 1 or "image_aspect_ratio" not in self._config.__dict__:  # for multi image case, we treat per image aspect ratio as "pad" by default.
                 self._config.image_aspect_ratio = getattr(gen_kwargs, "image_aspect_ratio", "pad")
                 eval_logger.info(f"Setting image aspect ratio: {self._config.image_aspect_ratio}")
 
+            if len(visual) == 0:  # For textonly task
+                image_tensor = None
+                task_type = 'textonly'
+
             if type(visual[0]) == PIL.Image.Image and "task_type" not in metadata and "sample_frames" not in metadata:  # For image task
                 image_tensor = process_images(visual, self._image_processor, self._config)