Merged with main
AlexKoff88 committed Dec 15, 2023
2 parents 4fff849 + 478ad69 commit f9800b7
Showing 19 changed files with 801 additions and 95 deletions.
18 changes: 0 additions & 18 deletions .github/workflows/delete_doc_comment.yml

This file was deleted.

12 changes: 0 additions & 12 deletions .github/workflows/delete_doc_comment_trigger.yml

This file was deleted.

3 changes: 2 additions & 1 deletion README.md
@@ -75,12 +75,13 @@ It is possible to export your model to the [OpenVINO](https://docs.openvino.ai/2
optimum-cli export openvino --model gpt2 ov_model
```

If you add `--int8`, the weights will be quantized to INT8, the activations will be kept in floating point precision.
If you add `--int8`, the model's linear and embedding weights will be quantized to INT8, while the activations will be kept in floating point precision.

```plain
optimum-cli export openvino --model gpt2 --int8 ov_model
```

To apply quantization on both weights and activations, refer to the [documentation](https://huggingface.co/docs/optimum/main/en/intel/optimization_ov).
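
As a rough, version-dependent sketch, static quantization of both weights and activations with the `OVQuantizer` API typically looks like the following (the model and calibration dataset are illustrative choices, and argument names may differ across releases):

```python
from functools import partial

from transformers import AutoModelForSequenceClassification, AutoTokenizer
from optimum.intel import OVQuantizer

model_id = "distilbert-base-uncased-finetuned-sst-2-english"  # illustrative model
model = AutoModelForSequenceClassification.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)


def preprocess_fn(examples, tokenizer):
    return tokenizer(examples["sentence"], padding=True, truncation=True)


quantizer = OVQuantizer.from_pretrained(model)
# A small calibration set is used to estimate activation ranges
calibration_dataset = quantizer.get_calibration_dataset(
    "glue",
    dataset_config_name="sst2",
    preprocess_function=partial(preprocess_fn, tokenizer=tokenizer),
    num_samples=100,
    dataset_split="train",
)
# Quantizes weights and activations, then saves the resulting OpenVINO model
quantizer.quantize(calibration_dataset=calibration_dataset, save_directory="quantized_model")
```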

#### Inference:

24 changes: 23 additions & 1 deletion docs/source/inference.mdx
@@ -102,7 +102,7 @@ You can also apply INT8 quantization on your models weights when exporting your
optimum-cli export openvino --model gpt2 --int8 ov_model
```

This will results in the exported model linear and embedding layers to be quanrtized to INT8, the activations will be kept in floating point precision.
This results in the exported model's linear and embedding layers being quantized to INT8, while the activations are kept in floating point precision.

This can also be done when loading your model by setting the `load_in_8bit` argument when calling the `from_pretrained()` method.
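
As a minimal sketch of that path (the model name is illustrative):

```python
from optimum.intel import OVModelForCausalLM

# export=True converts the model to the OpenVINO format on the fly;
# load_in_8bit=True compresses the weights to INT8 while activations stay in floating point
model = OVModelForCausalLM.from_pretrained("gpt2", export=True, load_in_8bit=True)
```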

@@ -454,3 +454,25 @@ refiner = OVStableDiffusionXLImg2ImgPipeline.from_pretrained(model_id, export=Tr
image = base(prompt=prompt, output_type="latent").images[0]
image = refiner(prompt=prompt, image=image[None, :]).images[0]
```


## Latent Consistency Models


| Task | Auto Class |
|--------------------------------------|--------------------------------------|
| `text-to-image` | `OVLatentConsistencyModelPipeline` |


### Text-to-Image

Here is an example of how you can load a Latent Consistency Model (LCM) from [SimianLuo/LCM_Dreamshaper_v7](https://huggingface.co/SimianLuo/LCM_Dreamshaper_v7) and run inference using OpenVINO:

```python
from optimum.intel import OVLatentConsistencyModelPipeline

model_id = "SimianLuo/LCM_Dreamshaper_v7"
pipeline = OVLatentConsistencyModelPipeline.from_pretrained(model_id, export=True)
prompt = "sailing ship in storm by Leonardo da Vinci"
images = pipeline(prompt, num_inference_steps=4, guidance_scale=8.0).images
```
18 changes: 17 additions & 1 deletion optimum/exporters/openvino/__main__.py
@@ -26,10 +26,19 @@
from optimum.utils import DEFAULT_DUMMY_SHAPES
from optimum.utils.save_utils import maybe_load_preprocessors, maybe_save_preprocessors

from ...intel.utils.import_utils import is_nncf_available
from ...intel.utils.import_utils import is_nncf_available, is_optimum_version, is_transformers_version
from .convert import export_models


if is_optimum_version(">=", "1.16.0"):
from optimum.exporters.onnx.constants import SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED
else:
# Copied from https://github.com/huggingface/optimum/blob/main/optimum/exporters/onnx/constants.py
SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED = [
"bart",
"whisper",
]

OV_XML_FILE_NAME = "openvino_model.xml"

_MAX_UNCOMPRESSED_SIZE = 1e9
@@ -143,10 +152,12 @@ def main_export(
do_gptq_patching = False
try:
config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=trust_remote_code)
model_type = config.model_type.replace("_", "-")
config_dict = config.to_dict()
quantization_config = config_dict.get("quantization_config", None)
do_gptq_patching = quantization_config and quantization_config["quant_method"] == "gptq"
except Exception:
model_type = None
pass

if do_gptq_patching:
@@ -195,6 +206,10 @@ class StoreAttr(object):
f"The task could not be automatically inferred as this is available only for models hosted on the Hugging Face Hub. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}"
)

loading_kwargs = {}
if is_transformers_version(">=", "4.36") and model_type in SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED:
loading_kwargs["attn_implementation"] = "eager"

model = TasksManager.get_model_from_task(
task,
model_name_or_path,
@@ -207,6 +222,7 @@ class StoreAttr(object):
trust_remote_code=trust_remote_code,
framework=framework,
device=device,
**loading_kwargs,
)

custom_architecture = False
8 changes: 5 additions & 3 deletions optimum/exporters/openvino/convert.py
@@ -31,7 +31,7 @@
from optimum.exporters.onnx.model_patcher import DecoderModelPatcher
from optimum.utils import is_diffusers_available

from ...intel.utils.import_utils import is_nncf_available
from ...intel.utils.import_utils import is_nncf_available, is_optimum_version
from .utils import (
OV_XML_FILE_NAME,
clear_class_registry,
@@ -346,8 +346,10 @@ def export_pytorch(
# model.config.torchscript = True can not be used for patching, because it overrides return_dict to False
if custom_patcher or dict_inputs:
patcher = config.patch_model_for_export(model, model_kwargs=model_kwargs)
# DecoderModelPatcher does not override model forward
if isinstance(patcher, DecoderModelPatcher) or patcher.orig_forward_name != "forward":
# DecoderModelPatcher does not override model forward in optimum < 1.15
if (
isinstance(patcher, DecoderModelPatcher) and is_optimum_version("<", "1.15.0")
) or patcher.orig_forward_name != "forward":
patch_model_forward = True
patched_forward = model.forward
else:
2 changes: 2 additions & 0 deletions optimum/intel/__init__.py
@@ -99,6 +99,7 @@
"OVModelForPix2Struct",
"OVModelForQuestionAnswering",
"OVModelForSeq2SeqLM",
"OVModelForSpeechSeq2Seq",
"OVModelForSequenceClassification",
"OVModelForTokenClassification",
]
@@ -195,6 +196,7 @@
OVModelForQuestionAnswering,
OVModelForSeq2SeqLM,
OVModelForSequenceClassification,
OVModelForSpeechSeq2Seq,
OVModelForTokenClassification,
)

14 changes: 13 additions & 1 deletion optimum/intel/generation/modeling.py
@@ -44,12 +44,24 @@
logger = logging.getLogger(__name__)


def get_float_type(model_dtype: torch.dtype):
if model_dtype == torch.bfloat16:
return "bf16"
elif model_dtype == torch.float16:
return "fp16"
else:
return "fp32"


def prepare_jit_inputs(model: PreTrainedModel, task: str, use_cache: bool = False):
task = _TASK_ALIASES.get(task, task)
signature = inspect.signature(model.forward) if hasattr(model, "forward") else inspect.signature(model.__call__)
onnx_config_class = TasksManager.get_exporter_config_constructor(model=model, exporter="onnx", task=task)
float_dtype = get_float_type(model.dtype)
if "text-generation" in task:
onnx_config = onnx_config_class(model.config, use_past=use_cache, use_past_in_inputs=use_cache)
onnx_config = onnx_config_class(
model.config, use_past=use_cache, use_past_in_inputs=use_cache, float_dtype=float_dtype
)
else:
onnx_config = onnx_config_class(model.config)

2 changes: 1 addition & 1 deletion optimum/intel/neural_compressor/modeling_base.py
@@ -164,7 +164,7 @@ def _from_pretrained(
if q_config is None:
model = model_class.from_pretrained(model_save_dir)
else:
init_contexts = [no_init_weights(_enable=True)]
init_contexts = [no_init_weights(_enable=False)]
with ContextManagers(init_contexts):
model = model_class(config)
try:
29 changes: 23 additions & 6 deletions optimum/intel/neural_compressor/trainer.py
@@ -70,7 +70,7 @@
from optimum.exporters import TasksManager

from ..utils.constant import _TASK_ALIASES, MIN_QDQ_ONNX_OPSET, ONNX_WEIGHTS_NAME, TRAINING_ARGS_NAME
from ..utils.import_utils import is_neural_compressor_version
from ..utils.import_utils import is_neural_compressor_version, is_transformers_version
from .configuration import INCConfig


@@ -207,6 +207,9 @@ def _inner_training_loop(
):
self.accelerator.free_memory()
self._train_batch_size = batch_size

if self.args.auto_find_batch_size:
self.state.train_batch_size = self._train_batch_size
logger.debug(f"Currently training with a batch size of: {self._train_batch_size}")
# Data loader and number of training steps
train_dataloader = self.get_train_dataloader()
@@ -260,7 +263,10 @@ else:
else:
debug_overflow = DebugUnderflowOverflow(self.model) # noqa

delay_optimizer_creation = is_sagemaker_mp_enabled() or self.fsdp is not None or self.is_fsdp_enabled
is_fsdp_xla_enabled = (
self.is_fsdp_xla_enabled if is_transformers_version(">=", "4.36.0") else self.fsdp is not None
)
delay_optimizer_creation = is_sagemaker_mp_enabled() or is_fsdp_xla_enabled or self.is_fsdp_enabled

if self.is_deepspeed_enabled:
self.optimizer, self.lr_scheduler = deepspeed_init(self, num_training_steps=max_steps)
@@ -270,6 +276,7 @@

self.state = TrainerState()
self.state.is_hyper_param_search = trial is not None
self.state.train_batch_size = self._train_batch_size

# Compute absolute values for logging, eval, and save if given as ratio
if args.logging_steps is not None:
@@ -305,7 +312,7 @@
use_accelerator_prepare = True if model is self.model else False

if delay_optimizer_creation:
if use_accelerator_prepare:
if is_transformers_version("<", "4.36.0") and use_accelerator_prepare:
self.model = self.accelerator.prepare(self.model)
self.create_optimizer_and_scheduler(num_training_steps=max_steps)

@@ -473,6 +480,18 @@
step = -1
for step, inputs in enumerate(epoch_iterator):
total_batched_samples += 1

if is_transformers_version(">=", "4.36.0") and self.args.include_num_input_tokens_seen:
main_input_name = getattr(self.model, "main_input_name", "input_ids")
if main_input_name not in inputs:
logger.warning(
"Tried to track the number of tokens seen, however the current model is "
"not configured properly to know what item is the input. To fix this, add "
"a `main_input_name` attribute to the model class you are using."
)
else:
self.state.num_input_tokens_seen += self.accelerator.gather(inputs[main_input_name]).numel()

if rng_to_sync:
self._load_rng_state(resume_from_checkpoint)
rng_to_sync = False
@@ -521,9 +540,7 @@
):
# the `or` condition of `is_last_step_and_steps_less_than_grad_acc` is not covered
# in accelerate. So, explicitly enable sync gradients to True in that case.
if is_last_step_and_steps_less_than_grad_acc or (
version.parse(accelerate_version) <= version.parse("0.20.3")
):
if is_last_step_and_steps_less_than_grad_acc:
self.accelerator.gradient_state._set_sync_gradients(True)

# Gradient clipping
2 changes: 1 addition & 1 deletion optimum/intel/openvino/__init__.py
@@ -46,7 +46,7 @@
OVModelForTokenClassification,
)
from .modeling_decoder import OVModelForCausalLM
from .modeling_seq2seq import OVModelForPix2Struct, OVModelForSeq2SeqLM
from .modeling_seq2seq import OVModelForPix2Struct, OVModelForSeq2SeqLM, OVModelForSpeechSeq2Seq


if is_diffusers_available():
2 changes: 0 additions & 2 deletions optimum/intel/openvino/modeling_base_seq2seq.py
@@ -68,8 +68,6 @@ def __init__(
self.ov_config = ov_config if ov_config is not None else {}
self.preprocessors = kwargs.get("preprocessors", [])

if "GPU" in self._device:
raise ValueError("Support of dynamic shapes for GPU devices is not yet available.")
if self.is_dynamic:
encoder = self._reshape(encoder, -1, -1, is_decoder=False)
decoder = self._reshape(decoder, -1, -1)
