From d1273b4d5353cd973d9027055315d32c45bad335 Mon Sep 17 00:00:00 2001
From: eaidova
Date: Fri, 22 Nov 2024 13:14:44 +0400
Subject: [PATCH] fix pil import in VLM

Import PIL only under TYPE_CHECKING and quote the `Image` annotations so
the module no longer imports PIL at import time. The guarded import binds
the `PIL.Image.Image` class (not the `PIL.Image` module), so the quoted
annotations keep the same meaning they had before this change.

---
 .../openvino/modeling_visual_language.py      | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index a1b531a1f..7573ddcef 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -4,7 +4,7 @@
 import warnings
 from abc import abstractmethod
 from pathlib import Path
-from typing import Dict, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union
 
 import numpy as np
 import openvino as ov
@@ -12,7 +12,6 @@
 from huggingface_hub import hf_hub_download
 from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
 from openvino._offline_transformations import apply_moc_transformations, compress_model_transformation
-from PIL.Image import Image
 from transformers import (
     AutoConfig,
     AutoImageProcessor,
@@ -50,6 +49,10 @@
     LlavaNextForConditionalGeneration = None
 
 
+if TYPE_CHECKING:
+    from PIL.Image import Image
+
+
 logger = logging.getLogger(__name__)
 
 core = ov.Core()
@@ -790,7 +793,7 @@ def can_generate(self):
     @abstractmethod
     def preprocess_inputs(
         text: str,
-        image: Optional[Image] = None,
+        image: Optional["Image"] = None,
         processor: Optional[AutoImageProcessor] = None,
         tokenizer: Optional[PreTrainedTokenizer] = None,
         config: Optional[PretrainedConfig] = None,
@@ -967,7 +970,7 @@ def _filter_unattended_tokens(self, input_ids, attention_mask, past_key_values):
     @staticmethod
     def preprocess_inputs(
         text: str,
-        image: Optional[Image] = None,
+        image: Optional["Image"] = None,
         processor: Optional[AutoImageProcessor] = None,
         tokenizer: Optional[PreTrainedTokenizer] = None,
         config: Optional[PretrainedConfig] = None,
@@ -1287,7 +1290,7 @@ def merge_vision_text_embeddings(
     @staticmethod
     def preprocess_inputs(
         text: str,
-        image: Optional[Image] = None,
+        image: Optional["Image"] = None,
         processor: Optional[AutoImageProcessor] = None,
         tokenizer: Optional[PreTrainedTokenizer] = None,
         config: Optional[PretrainedConfig] = None,
@@ -1662,7 +1665,7 @@ def merge_vision_text_embeddings(
     @staticmethod
     def preprocess_inputs(
         text: str,
-        image: Optional[Image] = None,
+        image: Optional["Image"] = None,
         processor: Optional[AutoImageProcessor] = None,
         tokenizer: Optional[PreTrainedTokenizer] = None,
         config: Optional[PretrainedConfig] = None,
@@ -1857,7 +1860,7 @@ def get_multimodal_embeddings(
     @staticmethod
     def preprocess_inputs(
         text: str,
-        image: Optional[Image] = None,
+        image: Optional["Image"] = None,
         processor: Optional[AutoImageProcessor] = None,
         tokenizer: Optional[PreTrainedTokenizer] = None,
         config: Optional[PretrainedConfig] = None,
@@ -2017,7 +2020,7 @@ def get_multimodal_embeddings(
     @staticmethod
     def preprocess_inputs(
         text: str,
-        image: Optional[Image] = None,
+        image: Optional["Image"] = None,
         processor: Optional[AutoImageProcessor] = None,
         tokenizer: Optional[PreTrainedTokenizer] = None,
         config: Optional[PretrainedConfig] = None,