Merged with main
AlexKoff88 committed Dec 15, 2023
2 parents 4fff849 + 478ad69 commit f9800b7
Showing 19 changed files with 801 additions and 95 deletions.
18 changes: 0 additions & 18 deletions .github/workflows/delete_doc_comment.yml

This file was deleted.

12 changes: 0 additions & 12 deletions .github/workflows/delete_doc_comment_trigger.yml

This file was deleted.

3 changes: 2 additions & 1 deletion README.md
@@ -75,12 +75,13 @@ It is possible to export your model to the [OpenVINO](https://docs.openvino.ai/2
optimum-cli export openvino --model gpt2 ov_model
```

If you add `--int8`, the weights will be quantized to INT8, the activations will be kept in floating point precision.
If you add `--int8`, the model's linear and embedding weights will be quantized to INT8, while the activations will be kept in floating point precision.

```plain
optimum-cli export openvino --model gpt2 --int8 ov_model
```

To apply quantization on both weights and activations, refer to the [documentation](https://huggingface.co/docs/optimum/main/en/intel/optimization_ov).
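
As a rough, version-dependent sketch, static quantization of both weights and activations with the `OVQuantizer` API typically looks like the following (the model and calibration dataset are illustrative choices, and argument names may differ across releases):

```python
from functools import partial

from transformers import AutoModelForSequenceClassification, AutoTokenizer
from optimum.intel import OVQuantizer

model_id = "distilbert-base-uncased-finetuned-sst-2-english"  # illustrative model
model = AutoModelForSequenceClassification.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)


def preprocess_fn(examples, tokenizer):
    return tokenizer(examples["sentence"], padding=True, truncation=True)


quantizer = OVQuantizer.from_pretrained(model)
# A small calibration set is used to estimate activation ranges
calibration_dataset = quantizer.get_calibration_dataset(
    "glue",
    dataset_config_name="sst2",
    preprocess_function=partial(preprocess_fn, tokenizer=tokenizer),
    num_samples=100,
    dataset_split="train",
)
# Quantizes weights and activations, then saves the resulting OpenVINO model
quantizer.quantize(calibration_dataset=calibration_dataset, save_directory="quantized_model")
```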

#### Inference:

24 changes: 23 additions & 1 deletion docs/source/inference.mdx
@@ -102,7 +102,7 @@ You can also apply INT8 quantization on your models weights when exporting your
optimum-cli export openvino --model gpt2 --int8 ov_model
```

This will results in the exported model linear and embedding layers to be quanrtized to INT8, the activations will be kept in floating point precision.
This results in the exported model's linear and embedding layers being quantized to INT8, while the activations are kept in floating point precision.

This can also be done when loading your model by setting the `load_in_8bit` argument when calling the `from_pretrained()` method.
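
As a minimal sketch of that path (the model name is illustrative):

```python
from optimum.intel import OVModelForCausalLM

# export=True converts the model to the OpenVINO format on the fly;
# load_in_8bit=True compresses the weights to INT8 while activations stay in floating point
model = OVModelForCausalLM.from_pretrained("gpt2", export=True, load_in_8bit=True)
```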

@@ -454,3 +454,25 @@ refiner = OVStableDiffusionXLImg2ImgPipeline.from_pretrained(model_id, export=Tr
image = base(prompt=prompt, output_type="latent").images[0]
image = refiner(prompt=prompt, image=image[None, :]).images[0]
```


## Latent Consistency Models


| Task | Auto Class |
|--------------------------------------|--------------------------------------|
| `text-to-image` | `OVLatentConsistencyModelPipeline` |


### Text-to-Image

Here is an example of how you can load a Latent Consistency Model (LCM) from [SimianLuo/LCM_Dreamshaper_v7](https://huggingface.co/SimianLuo/LCM_Dreamshaper_v7) and run inference using OpenVINO:

```python
from optimum.intel import OVLatentConsistencyModelPipeline

model_id = "SimianLuo/LCM_Dreamshaper_v7"
pipeline = OVLatentConsistencyModelPipeline.from_pretrained(model_id, export=True)
prompt = "sailing ship in storm by Leonardo da Vinci"
images = pipeline(prompt, num_inference_steps=4, guidance_scale=8.0).images
```
18 changes: 17 additions & 1 deletion optimum/exporters/openvino/__main__.py
@@ -26,10 +26,19 @@
from optimum.utils import DEFAULT_DUMMY_SHAPES
from optimum.utils.save_utils import maybe_load_preprocessors, maybe_save_preprocessors

from ...intel.utils.import_utils import is_nncf_available
from ...intel.utils.import_utils import is_nncf_available, is_optimum_version, is_transformers_version
from .convert import export_models


if is_optimum_version(">=", "1.16.0"):
from optimum.exporters.onnx.constants import SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED
else:
# Copied from https://github.com/huggingface/optimum/blob/main/optimum/exporters/onnx/constants.py
SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED = [
"bart",
"whisper",
]

OV_XML_FILE_NAME = "openvino_model.xml"

_MAX_UNCOMPRESSED_SIZE = 1e9
@@ -143,10 +152,12 @@ def main_export(
do_gptq_patching = False
try:
config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=trust_remote_code)
model_type = config.model_type.replace("_", "-")
config_dict = config.to_dict()
quantization_config = config_dict.get("quantization_config", None)
do_gptq_patching = quantization_config and quantization_config["quant_method"] == "gptq"
except Exception:
model_type = None
pass

if do_gptq_patching:
@@ -195,6 +206,10 @@ class StoreAttr(object):
f"The task could not be automatically inferred as this is available only for models hosted on the Hugging Face Hub. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}"
)

loading_kwargs = {}
if is_transformers_version(">=", "4.36") and model_type in SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED:
loading_kwargs["attn_implementation"] = "eager"

model = TasksManager.get_model_from_task(
task,
model_name_or_path,
@@ -207,6 +222,7 @@ class StoreAttr(object):
trust_remote_code=trust_remote_code,
framework=framework,
device=device,
**loading_kwargs,
)

custom_architecture = False
8 changes: 5 additions & 3 deletions optimum/exporters/openvino/convert.py
@@ -31,7 +31,7 @@
from optimum.exporters.onnx.model_patcher import DecoderModelPatcher
from optimum.utils import is_diffusers_available

from ...intel.utils.import_utils import is_nncf_available
from ...intel.utils.import_utils import is_nncf_available, is_optimum_version
from .utils import (
OV_XML_FILE_NAME,
clear_class_registry,
@@ -346,8 +346,10 @@ def export_pytorch(
# model.config.torchscript = True can not be used for patching, because it overrides return_dict to False
if custom_patcher or dict_inputs:
patcher = config.patch_model_for_export(model, model_kwargs=model_kwargs)
# DecoderModelPatcher does not override model forward
if isinstance(patcher, DecoderModelPatcher) or patcher.orig_forward_name != "forward":
# DecoderModelPatcher does not override model forward in optimum < 1.15
if (
isinstance(patcher, DecoderModelPatcher) and is_optimum_version("<", "1.15.0")
) or patcher.orig_forward_name != "forward":
patch_model_forward = True
patched_forward = model.forward
else:
2 changes: 2 additions & 0 deletions optimum/intel/__init__.py
@@ -99,6 +99,7 @@
"OVModelForPix2Struct",
"OVModelForQuestionAnswering",
"OVModelForSeq2SeqLM",
"OVModelForSpeechSeq2Seq",
"OVModelForSequenceClassification",
"OVModelForTokenClassification",
]
@@ -195,6 +196,7 @@
OVModelForQuestionAnswering,
OVModelForSeq2SeqLM,
OVModelForSequenceClassification,
OVModelForSpeechSeq2Seq,
OVModelForTokenClassification,
)

14 changes: 13 additions & 1 deletion optimum/intel/generation/modeling.py
@@ -44,12 +44,24 @@
logger = logging.getLogger(__name__)


def get_float_type(model_dtype: torch.dtype):
if model_dtype == torch.bfloat16:
return "bf16"
elif model_dtype == torch.float16:
return "fp16"
else:
return "fp32"


def prepare_jit_inputs(model: PreTrainedModel, task: str, use_cache: bool = False):
task = _TASK_ALIASES.get(task, task)
signature = inspect.signature(model.forward) if hasattr(model, "forward") else inspect.signature(model.__call__)
onnx_config_class = TasksManager.get_exporter_config_constructor(model=model, exporter="onnx", task=task)
float_dtype = get_float_type(model.dtype)
if "text-generation" in task:
onnx_config = onnx_config_class(model.config, use_past=use_cache, use_past_in_inputs=use_cache)
onnx_config = onnx_config_class(
model.config, use_past=use_cache, use_past_in_inputs=use_cache, float_dtype=float_dtype
)
else:
onnx_config = onnx_config_class(model.config)

2 changes: 1 addition & 1 deletion optimum/intel/neural_compressor/modeling_base.py
@@ -164,7 +164,7 @@ def _from_pretrained(
if q_config is None:
model = model_class.from_pretrained(model_save_dir)
else:
init_contexts = [no_init_weights(_enable=True)]
init_contexts = [no_init_weights(_enable=False)]
with ContextManagers(init_contexts):
model = model_class(config)
try:
29 changes: 23 additions & 6 deletions optimum/intel/neural_compressor/trainer.py
@@ -70,7 +70,7 @@
from optimum.exporters import TasksManager

from ..utils.constant import _TASK_ALIASES, MIN_QDQ_ONNX_OPSET, ONNX_WEIGHTS_NAME, TRAINING_ARGS_NAME
from ..utils.import_utils import is_neural_compressor_version
from ..utils.import_utils import is_neural_compressor_version, is_transformers_version
from .configuration import INCConfig


@@ -207,6 +207,9 @@ def _inner_training_loop(
):
self.accelerator.free_memory()
self._train_batch_size = batch_size

if self.args.auto_find_batch_size:
self.state.train_batch_size = self._train_batch_size
logger.debug(f"Currently training with a batch size of: {self._train_batch_size}")
# Data loader and number of training steps
train_dataloader = self.get_train_dataloader()
@@ -260,7 +263,10 @@ else:
else:
debug_overflow = DebugUnderflowOverflow(self.model) # noqa

delay_optimizer_creation = is_sagemaker_mp_enabled() or self.fsdp is not None or self.is_fsdp_enabled
is_fsdp_xla_enabled = (
self.is_fsdp_xla_enabled if is_transformers_version(">=", "4.36.0") else self.fsdp is not None
)
delay_optimizer_creation = is_sagemaker_mp_enabled() or is_fsdp_xla_enabled or self.is_fsdp_enabled

if self.is_deepspeed_enabled:
self.optimizer, self.lr_scheduler = deepspeed_init(self, num_training_steps=max_steps)
@@ -270,6 +276,7 @@

self.state = TrainerState()
self.state.is_hyper_param_search = trial is not None
self.state.train_batch_size = self._train_batch_size

# Compute absolute values for logging, eval, and save if given as ratio
if args.logging_steps is not None:
@@ -305,7 +312,7 @@
use_accelerator_prepare = True if model is self.model else False

if delay_optimizer_creation:
if use_accelerator_prepare:
if is_transformers_version("<", "4.36.0") and use_accelerator_prepare:
self.model = self.accelerator.prepare(self.model)
self.create_optimizer_and_scheduler(num_training_steps=max_steps)

@@ -473,6 +480,18 @@
step = -1
for step, inputs in enumerate(epoch_iterator):
total_batched_samples += 1

if is_transformers_version(">=", "4.36.0") and self.args.include_num_input_tokens_seen:
main_input_name = getattr(self.model, "main_input_name", "input_ids")
if main_input_name not in inputs:
logger.warning(
"Tried to track the number of tokens seen, however the current model is "
"not configured properly to know what item is the input. To fix this, add "
"a `main_input_name` attribute to the model class you are using."
)
else:
self.state.num_input_tokens_seen += self.accelerator.gather(inputs[main_input_name]).numel()

if rng_to_sync:
self._load_rng_state(resume_from_checkpoint)
rng_to_sync = False
@@ -521,9 +540,7 @@
):
# the `or` condition of `is_last_step_and_steps_less_than_grad_acc` is not covered
# in accelerate. So, explicitly enable sync gradients to True in that case.
if is_last_step_and_steps_less_than_grad_acc or (
version.parse(accelerate_version) <= version.parse("0.20.3")
):
if is_last_step_and_steps_less_than_grad_acc:
self.accelerator.gradient_state._set_sync_gradients(True)

# Gradient clipping
2 changes: 1 addition & 1 deletion optimum/intel/openvino/__init__.py
@@ -46,7 +46,7 @@
OVModelForTokenClassification,
)
from .modeling_decoder import OVModelForCausalLM
from .modeling_seq2seq import OVModelForPix2Struct, OVModelForSeq2SeqLM
from .modeling_seq2seq import OVModelForPix2Struct, OVModelForSeq2SeqLM, OVModelForSpeechSeq2Seq


if is_diffusers_available():
2 changes: 0 additions & 2 deletions optimum/intel/openvino/modeling_base_seq2seq.py
@@ -68,8 +68,6 @@ def __init__(
self.ov_config = ov_config if ov_config is not None else {}
self.preprocessors = kwargs.get("preprocessors", [])

if "GPU" in self._device:
raise ValueError("Support of dynamic shapes for GPU devices is not yet available.")
if self.is_dynamic:
encoder = self._reshape(encoder, -1, -1, is_decoder=False)
decoder = self._reshape(decoder, -1, -1)
