fix config saving when check on misplaced args is broken (#966)
* fix config saving when check on misplaced args is broken

* add internvl test

* fix tests

* fix tests

* numeric stability in tests

* fix code style

* update and reuse preprocess_inputs

* Update optimum/exporters/openvino/utils.py

Co-authored-by: Nikita Savelyev <[email protected]>

* Update tests/openvino/test_modeling.py

Co-authored-by: Nikita Savelyev <[email protected]>

* change preprocess_inputs signature

* fix quantization after signature update

* fix preparing generation config

* Update optimum/intel/openvino/modeling_visual_language.py

Co-authored-by: Nikita Savelyev <[email protected]>

---------

Co-authored-by: Nikita Savelyev <[email protected]>
eaidova and nikita-savelyevv authored Nov 13, 2024
1 parent 0447ae2 commit 5c879b9
Showing 9 changed files with 296 additions and 66 deletions.
9 changes: 7 additions & 2 deletions optimum/exporters/openvino/convert.py
@@ -71,6 +71,7 @@
_get_open_clip_submodels_fn_and_export_configs,
clear_class_registry,
remove_none_from_dummy_inputs,
save_config,
)


@@ -684,7 +685,11 @@ def export_from_model(
files_subpaths = ["openvino_" + model_name + ".xml" for model_name in models_and_export_configs.keys()]
elif library_name != "diffusers":
if is_transformers_version(">=", "4.44.99"):
misplaced_generation_parameters = model.config._get_non_default_generation_parameters()
# some model configs may raise here when generation parameters were never initialized
try:
misplaced_generation_parameters = model.config._get_non_default_generation_parameters()
except KeyError:
misplaced_generation_parameters = {}
if isinstance(model, GenerationMixin) and len(misplaced_generation_parameters) > 0:
logger.warning(
"Moving the following attributes in the config to the generation config: "
@@ -696,7 +701,7 @@ def export_from_model(
setattr(model.config, param_name, None)

# Saving the model config and preprocessor as this is needed sometimes.
model.config.save_pretrained(output)
save_config(model.config, output)
generation_config = getattr(model, "generation_config", None)
if generation_config is not None:
try:
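For context, a minimal sketch of the guarded pattern this hunk introduces; the helper name below is hypothetical, the real logic is inline in export_from_model:

# Hypothetical helper mirroring the guard above: on some configs the
# transformers accessor raises KeyError when generation parameters were
# never initialized, so the export falls back to an empty dict.
from transformers import PretrainedConfig

def collect_misplaced_generation_params(config: PretrainedConfig) -> dict:
    try:
        return config._get_non_default_generation_parameters()
    except KeyError:
        return {}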
2 changes: 1 addition & 1 deletion optimum/exporters/openvino/model_configs.py
@@ -1464,7 +1464,7 @@ class InternVLChatConfigBehavior(str, enum.Enum):
@register_in_tasks_manager("internvl-chat", *["image-text-to-text"], library_name="transformers")
class InternVLChatOpenVINOConfig(OnnxConfig):
SUPPORTED_BEHAVIORS = [model_type.value for model_type in InternVLChatConfigBehavior]
NORMALIZED_CONFIG_CLASS = NormalizedTextConfig
NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig
DUMMY_INPUT_GENERATOR_CLASSES = (DummyVisionInputGenerator,)

def __init__(
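The one-line switch above matters because DummyVisionInputGenerator resolves image attributes through the normalized config; a rough sketch of the lookup, assuming an InternVL-style vision sub-config:

# Rough sketch (attribute names assumed from the vision sub-config):
# NormalizedTextConfig carries no mapping for these vision attributes.
from optimum.utils import NormalizedVisionConfig

normalized = NormalizedVisionConfig(model_config.vision_config)
image_size, num_channels = normalized.image_size, normalized.num_channels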
18 changes: 18 additions & 0 deletions optimum/exporters/openvino/utils.py
@@ -13,7 +13,9 @@
# limitations under the License.

import inspect
import logging
from collections import namedtuple
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

from transformers.utils import is_torch_available
@@ -25,6 +27,9 @@
from optimum.utils import is_diffusers_available


logger = logging.getLogger(__name__)


InputInfo = namedtuple("InputInfo", ["name", "shape", "type", "example"])


@@ -209,3 +214,16 @@ def get_submodels(model):


MULTI_MODAL_TEXT_GENERATION_MODELS = ["llava", "llava-next", "llava-qwen2", "internvl-chat", "minicpmv"]


def save_config(config, save_dir):
try:
config.save_pretrained(save_dir)
except Exception as exp:
logger.warning(
f"Attempt to save config using standard API has failed with {exp}. There may be an issue with model config, please check its correctness before usage."
)
save_dir = Path(save_dir)
save_dir.mkdir(exist_ok=True, parents=True)
output_config_file = save_dir / "config.json"
config.to_json_file(output_config_file, use_diff=True)
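
A usage sketch of the fallback helper; the checkpoint id and output directory are placeholders:

# Illustrative usage: save_config() tries config.save_pretrained() first and,
# if that raises, falls back to a plain JSON dump so the export still succeeds.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("some/internvl-checkpoint", trust_remote_code=True)  # placeholder id
save_config(config, "ov_export")  # ends up with ov_export/config.json either way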
6 changes: 5 additions & 1 deletion optimum/intel/openvino/modeling_base.py
@@ -136,7 +136,11 @@ def __init__(
self.generation_config = generation_config or GenerationConfig.from_model_config(config)

if is_transformers_version(">=", "4.44.99"):
misplaced_generation_parameters = self.config._get_non_default_generation_parameters()
# some model configs may raise here when generation parameters were never initialized
try:
misplaced_generation_parameters = self.config._get_non_default_generation_parameters()
except KeyError:
misplaced_generation_parameters = {}
if len(misplaced_generation_parameters) > 0:
logger.warning(
"Moving the following attributes in the config to the generation config: "
6 changes: 5 additions & 1 deletion optimum/intel/openvino/modeling_base_seq2seq.py
@@ -84,7 +84,11 @@ def __init__(
self.generation_config = generation_config or GenerationConfig.from_model_config(config)

if is_transformers_version(">=", "4.44.99"):
misplaced_generation_parameters = self.config._get_non_default_generation_parameters()
# some model configs may raise here when generation parameters were never initialized
try:
misplaced_generation_parameters = self.config._get_non_default_generation_parameters()
except KeyError:
misplaced_generation_parameters = {}
if len(misplaced_generation_parameters) > 0:
logger.warning(
"Moving the following attributes in the config to the generation config: "
215 changes: 200 additions & 15 deletions optimum/intel/openvino/modeling_visual_language.py
@@ -15,6 +15,7 @@
from PIL.Image import Image
from transformers import (
AutoConfig,
AutoImageProcessor,
GenerationConfig,
GenerationMixin,
PretrainedConfig,
@@ -24,6 +25,7 @@

from ...exporters.openvino import main_export
from ...exporters.openvino.stateful import ensure_stateful_is_available, model_has_input_output_name
from ...exporters.openvino.utils import save_config
from .. import OVQuantizer
from .configuration import OVConfig, OVWeightQuantizationConfig
from .modeling_base import OVBaseModel, OVModelPart
@@ -319,6 +321,13 @@ def compile(self):
if part_model is not None:
part_model._compile()

def _save_config(self, save_directory):
"""
Saves a model configuration into a directory, so that it can be re-loaded using the
[`from_pretrained`] class method.
"""
save_config(self.config, save_directory)

def _save_pretrained(self, save_directory: Union[str, Path]):
"""
Saves the model to the OpenVINO IR format so that it can be re-loaded using the
@@ -728,9 +737,9 @@ def can_generate(self):
@staticmethod
@abstractmethod
def preprocess_inputs(
processor,
text: str,
image: Optional[Image] = None,
processor: Optional[AutoImageProcessor] = None,
tokenizer: Optional[PreTrainedTokenizer] = None,
):
"""
@@ -902,15 +911,23 @@ def _filter_unattended_tokens(self, input_ids, attention_mask, past_key_values):

@staticmethod
def preprocess_inputs(
processor,
text: str,
image: Optional[Image] = None,
processor: Optional[AutoImageProcessor] = None,
tokenizer: Optional[PreTrainedTokenizer] = None,
):
if image is None:
raise ValueError("Image is required.")
chat_template = [{"role": "user", "content": [{"type": "text", "text": text}, {"type": "image"}]}]
prompt = processor.apply_chat_template(chat_template, add_generation_prompt=True)
if processor is None:
raise ValueError("Processor is required.")
if getattr(processor, "chat_template", None) is not None:
chat_prompt = [{"role": "user", "content": [{"type": "text", "text": text}]}]
if image is not None:
chat_prompt[0]["content"].append({"type": "image"})
prompt = processor.apply_chat_template(chat_prompt, add_generation_prompt=True, tokenize=False)
else:
if image is not None and "<image>" not in text:
prompt = "<image>\n" + text
else:
prompt = text
inputs = processor(images=image, text=prompt, return_tensors="pt")
return inputs
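
A usage sketch of the updated signature, which now takes text first and the processor by keyword; checkpoint names are placeholders:

# Illustrative call; the method branches on whether the processor
# carries a chat template, as shown above.
from PIL import Image
from transformers import AutoProcessor
from optimum.intel import OVModelForVisualCausalLM

processor = AutoProcessor.from_pretrained("llava-checkpoint")  # placeholder
model = OVModelForVisualCausalLM.from_pretrained("llava-ov-export")  # placeholder
pil_image = Image.open("cat.png")  # placeholder
inputs = model.preprocess_inputs(text="What is on the picture?", image=pil_image, processor=processor)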

@@ -1209,6 +1226,159 @@ def merge_vision_text_embeddings(
input_embeds = input_embeds.reshape(B, N, C)
return input_embeds, attention_mask, position_ids

def preprocess_inputs(
self,
text: str,
image: Optional[Image] = None,
processor: Optional[AutoImageProcessor] = None,
tokenizer: Optional[PreTrainedTokenizer] = None,
):
if tokenizer is None:
raise ValueError("Tokenizer is required.")
import torchvision.transforms as T
from torchvision.transforms.functional import InterpolationMode

IMG_START_TOKEN = "<img>"
IMG_END_TOKEN = "</img>"
IMG_CONTEXT_TOKEN = "<IMG_CONTEXT>"

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose(
[
T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD),
]
)
return transform

def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
best_ratio_diff = float("inf")
best_ratio = (1, 1)
area = width * height
for ratio in target_ratios:
target_aspect_ratio = ratio[0] / ratio[1]
ratio_diff = abs(aspect_ratio - target_aspect_ratio)
if ratio_diff < best_ratio_diff:
best_ratio_diff = ratio_diff
best_ratio = ratio
elif ratio_diff == best_ratio_diff:
if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
best_ratio = ratio
return best_ratio

def dynamic_preprocess(image, min_num=1, max_num=12, image_size=28, use_thumbnail=False):
orig_width, orig_height = image.size
aspect_ratio = orig_width / orig_height

# enumerate candidate tiling grids (i columns x j rows) within the patch budget
target_ratios = {
(i, j)
for n in range(min_num, max_num + 1)
for i in range(1, n + 1)
for j in range(1, n + 1)
if i * j <= max_num and i * j >= min_num
}
target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

# find the closest aspect ratio to the target
target_aspect_ratio = find_closest_aspect_ratio(
aspect_ratio, target_ratios, orig_width, orig_height, image_size
)

# calculate the target width and height
target_width = image_size * target_aspect_ratio[0]
target_height = image_size * target_aspect_ratio[1]
blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

# resize the image
resized_img = image.resize((target_width, target_height))
processed_images = []
for i in range(blocks):
box = (
(i % (target_width // image_size)) * image_size,
(i // (target_width // image_size)) * image_size,
((i % (target_width // image_size)) + 1) * image_size,
((i // (target_width // image_size)) + 1) * image_size,
)
# split the image
split_img = resized_img.crop(box)
processed_images.append(split_img)
assert len(processed_images) == blocks
if use_thumbnail and len(processed_images) != 1:
thumbnail_img = image.resize((image_size, image_size))
processed_images.append(thumbnail_img)
return processed_images

def load_image(image, input_size=448, max_num=12):
transform = build_transform(input_size=input_size)
images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
pixel_values = [transform(image) for image in images]
pixel_values = torch.stack(pixel_values)
return pixel_values

if image is not None:
if "<image>" not in text:
text = "<image>\n" + text
pixel_values = load_image(image, input_size=self.config.vision_config.image_size)
num_patches = pixel_values.shape[0]
num_image_token = int(
(self.config.vision_config.image_size // self.config.vision_config.patch_size) ** 2
* (self.config.downsample_ratio**2)
)
image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * num_image_token * num_patches + IMG_END_TOKEN
text = text.replace("<image>", image_tokens, 1)
text_inputs = tokenizer(text, return_tensors="pt")
inputs = dict(text_inputs)
inputs.update({"pixel_values": pixel_values})
else:
inputs = tokenizer(text, return_tensors="pt")
return inputs
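
A usage sketch for the InternVL path, which needs a tokenizer rather than a processor (reusing the placeholder model and pil_image from the sketch above):

# Illustrative: with image_size=448, patch_size=14 and downsample_ratio=0.5,
# the formula above gives (448 // 14) ** 2 * 0.5 ** 2 = 256 <IMG_CONTEXT>
# tokens per image tile.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("internvl-ov-export", trust_remote_code=True)  # placeholder
inputs = model.preprocess_inputs(text="Describe the image.", image=pil_image, tokenizer=tokenizer)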

# InternVL has an issue with the _get_non_default_generation_parameters check; as a workaround, override _prepare_generation_config
def _prepare_generation_config(
self, generation_config: Optional[GenerationConfig], **kwargs: Dict
) -> Tuple[GenerationConfig, Dict]:
using_model_generation_config = False
if generation_config is None:
if (
self.generation_config._from_model_config  # generation config was created from the model config
and self.generation_config._original_object_hash == hash(self.generation_config)  # and left unmodified since
):
new_generation_config = GenerationConfig.from_model_config(self.config)
if new_generation_config != self.generation_config:  # the user set generation parameters in the model config
warnings.warn(
"You have modified the pretrained model configuration to control generation. This is a"
" deprecated strategy to control generation and will be removed in v5."
" Please use and modify the model generation configuration (see"
" https://huggingface.co/docs/transformers/generation_strategies#default-text-generation-configuration )",
UserWarning,
)
self.generation_config = new_generation_config

generation_config = self.generation_config
using_model_generation_config = True

generation_config = copy.deepcopy(generation_config)
model_kwargs = generation_config.update(**kwargs)
# If `generation_config` is provided, let's fallback ALL special tokens to the default values for the model
if not using_model_generation_config:
if generation_config.bos_token_id is None:
generation_config.bos_token_id = self.generation_config.bos_token_id
if generation_config.eos_token_id is None:
generation_config.eos_token_id = self.generation_config.eos_token_id
if generation_config.pad_token_id is None:
generation_config.pad_token_id = self.generation_config.pad_token_id
if generation_config.decoder_start_token_id is None:
generation_config.decoder_start_token_id = self.generation_config.decoder_start_token_id

return generation_config, model_kwargs
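
The override reproduces GenerationMixin's logic while tolerating configs on which the non-default-parameter check raises; the special-token fallback at the end behaves roughly like this sketch:

# Illustrative: a user-supplied config without special tokens inherits them
# from the model's own generation config.
from transformers import GenerationConfig

user_config = GenerationConfig(max_new_tokens=32)  # no bos/eos/pad set
prepared, extra_kwargs = model._prepare_generation_config(user_config)
assert prepared.eos_token_id == model.generation_config.eos_token_id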


class _OVMiniCPMVForCausalLM(OVModelForVisualCausalLM):
additional_parts = ["resampler"]
@@ -1430,14 +1600,22 @@ def merge_vision_text_embeddings(

@staticmethod
def preprocess_inputs(
processor,
text: str,
image: Optional[Image] = None,
processor: Optional[AutoImageProcessor] = None,
tokenizer: Optional[PreTrainedTokenizer] = None,
):
if image is None:
raise ValueError("Image is required.")
prompt = f"<|im_start|>user\n(<image>./</image>)\n{text}<|im_end|>\n<|im_start|>assistant\n"
if processor is None:
raise ValueError("Processor is required.")
if getattr(processor, "chat_template", None) is not None:
messages = [{"role": "user", "content": text if image is None else "(<image>./</image>)\n" + text}]
prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
else:
prompt = (
f"<|im_start|>user\n(<image>./</image>)\n{text}<|im_end|>\n<|im_start|>assistant\n"
if image is not None
else text
)
inputs = processor([prompt], [image], return_tensors="pt")
return inputs
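
MiniCPM-V's processor consumes parallel lists of prompts and images; a sketch of a call against the branch above (placeholders reused from earlier sketches):

# Illustrative call; the processor receives list-aligned prompts and images.
inputs = model.preprocess_inputs(text="What is unusual here?", image=pil_image, processor=processor)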

@@ -1615,17 +1793,24 @@ def get_multimodal_embeddings(

@staticmethod
def preprocess_inputs(
processor,
text: str,
image: Optional[Image] = None,
processor: Optional[AutoImageProcessor] = None,
tokenizer: Optional[PreTrainedTokenizer] = None,
):
if tokenizer is None:
raise ValueError("Tokenizer is required.")
messages = [{"role": "user", "content": f"<image>\n{text}"}]
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
text_chunks = [tokenizer(chunk).input_ids for chunk in text.split("<image>")]
input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0)
if image is not None and processor is None:
raise ValueError("Processor is required.")
text_content = f"<image>\n{text}" if image is not None else text
messages = [{"role": "user", "content": text_content}]
if tokenizer.chat_template is not None:
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
if image is not None:
text_chunks = [tokenizer(chunk).input_ids for chunk in text.split("<image>")]
input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0)
else:
input_ids = tokenizer(text, return_tensors="pt").input_ids
attention_mask = torch.ones_like(input_ids, dtype=torch.int64)
result = {"input_ids": input_ids, "attention_mask": attention_mask}
if image is not None:
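The -200 id is a sentinel marking where vision features are spliced into the text embeddings; a sketch of the id construction (tokenizer is a placeholder):

# Illustrative: build input_ids with a -200 sentinel in the image slot,
# exactly as the branch above does for image inputs.
import torch

text = "<image>\nWhat is on the picture?"
chunks = [tokenizer(chunk).input_ids for chunk in text.split("<image>")]
# chunks[0] holds the ids before the tag, chunks[1] the ids after it
input_ids = torch.tensor(chunks[0] + [-200] + chunks[1], dtype=torch.long).unsqueeze(0)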
4 changes: 3 additions & 1 deletion optimum/intel/openvino/quantization.py
@@ -784,7 +784,9 @@ def _prepare_visual_causal_lm_dataset(self, config: OVWeightQuantizationConfig):
image = Image.open(requests.get(image_url, stream=True).raw)

try:
inputs = self.model.preprocess_inputs(processor, instruction, image, tokenizer)
inputs = self.model.preprocess_inputs(
text=instruction, image=image, processor=processor, tokenizer=tokenizer
)
except ValueError as value_error:
if "Tokenizer is required." in str(value_error) and tokenizer_error is not None:
raise tokenizer_error
