diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index 6d709eecfd..ba5b09ff81 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -36,3 +36,9 @@ jobs: - name: Test with Pytest run: | pytest tests/openvino/ --ignore test_modeling_basic + - name: Test openvino-nightly + run: | + pip uninstall -y openvino + pip install openvino-nightly + python -c "from optimum.intel import OVModelForCausalLM; OVModelForCausalLM.from_pretrained('hf-internal-testing/tiny-random-gpt2', export=True, compile=False)" + optimum-cli export openvino -m hf-internal-testing/tiny-random-gpt2 gpt2-ov diff --git a/.github/workflows/test_openvino_notebooks.yml b/.github/workflows/test_openvino_notebooks.yml index abc2a65440..7b037d0565 100644 --- a/.github/workflows/test_openvino_notebooks.yml +++ b/.github/workflows/test_openvino_notebooks.yml @@ -49,5 +49,7 @@ jobs: - name: Test with Pytest run: | + sed -i 's/NUM_TRAIN_ITEMS = 600/NUM_TRAIN_ITEMS = 10/' notebooks/openvino/question_answering_quantization.ipynb + sed -i 's/# %pip install/%pip install/' notebooks/openvino/optimum_openvino_inference.ipynb python -m pytest --nbval-lax notebooks/openvino/optimum_openvino_inference.ipynb notebooks/openvino/question_answering_quantization.ipynb diff --git a/docs/source/inference.mdx b/docs/source/inference.mdx index 65480c1d2f..e0b60baa2e 100644 --- a/docs/source/inference.mdx +++ b/docs/source/inference.mdx @@ -99,21 +99,22 @@ tokenizer.save_pretrained(save_directory) ### Weight-only quantization -You can also apply 8-bit or 4-bit weight quantization when exporting your model with the CLI by setting the `weight-format` argument to respectively `int8` or `int4`: +You can also apply fp16, 8-bit or 4-bit weight compression on the Linear, Convolutional and Embedding layers when exporting your model with the CLI by setting `--weight-format` to respectively `fp16`, `int8` or `int4`: ```bash optimum-cli export openvino --model gpt2 
--weight-format int8 ov_model ``` -This will result in the exported model linear and embedding layers to be quantized to INT8 or INT4, the activations will be kept in floating point precision. This type of optimization allows reducing the footprint and latency of LLMs. +This type of optimization allows reducing the memory footprint and inference latency. -By default the quantization scheme will be [assymmetric](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/Quantization.md#asymmetric-quantization), to make it [symmetric](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/Quantization.md#symmetric-quantization) you can add `--sym`. + +By default the quantization scheme will be [asymmetric](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/Quantization.md#asymmetric-quantization), to make it [symmetric](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/Quantization.md#symmetric-quantization) you can add `--sym`. For INT4 quantization you can also specify the following arguments : * The `--group-size` parameter will define the group size to use for quantization, `-1` it will results in per-column quantization. * The `--ratio` parameter controls the ratio between 4-bit and 8-bit quantization. If set to 0.9, it means that 90% of the layers will be quantized to `int4` while 10% will be quantized to `int8`. -Smaller `group_size` and `ratio` of usually improve accuracy at the sacrifice of the model size and inference latency. +Smaller `group_size` and `ratio` values usually improve accuracy at the sacrifice of the model size and inference latency. You can also apply 8-bit quantization on your model's weight when loading your model by setting the `load_in_8bit=True` argument when calling the `from_pretrained()` method. 
@@ -125,7 +126,7 @@ model = OVModelForCausalLM.from_pretrained(model_id, load_in_8bit=True) -`load_in_8bit` is enabled by default for the models larger than 1 billion parameters. +`load_in_8bit` is enabled by default for the models larger than 1 billion parameters. You can disable it with `load_in_8bit=False`. diff --git a/docs/source/optimization_ov.mdx b/docs/source/optimization_ov.mdx index 70c98f14f7..1e78c36805 100644 --- a/docs/source/optimization_ov.mdx +++ b/docs/source/optimization_ov.mdx @@ -19,15 +19,72 @@ limitations under the License. 🤗 Optimum Intel provides an `openvino` package that enables you to apply a variety of model compression methods such as quantization, pruning, on many models hosted on the 🤗 hub using the [NNCF](https://docs.openvino.ai/2022.1/docs_nncf_introduction.html) framework. -## Post-training optimization +## Post-training -Post-training static quantization introduces an additional calibration step where data is fed through the network in order to compute the activations quantization parameters. -Here is how to apply static quantization on a fine-tuned DistilBERT: +Quantization is a technique to reduce the computational and memory costs of running inference by representing the weights and / or the activations with lower precision data types like 8-bit or 4-bit. + +### Weight-only quantization + +Quantization can be applied on the model's Linear, Convolutional and Embedding layers, enabling the loading of large models on memory-limited devices. For example, when applying 8-bit quantization, the resulting model will be x4 smaller than its fp32 counterpart. For 4-bit quantization, the reduction in memory could theoretically reach x8, but is closer to x6 in practice. 
+ + +#### 8-bit + +For the 8-bit weight quantization you can set `load_in_8bit=True` to load your model's weights in 8-bit: ```python -from functools import partial -from transformers import AutoTokenizer -from optimum.intel import OVConfig, OVQuantizer, OVModelForSequenceClassification, +from optimum.intel import OVModelForCausalLM + +model_id = "helenai/gpt2-ov" +model = OVModelForCausalLM.from_pretrained(model_id, load_in_8bit=True) + +# Saves the int8 model that will be x4 smaller than its fp32 counterpart +model.save_pretrained(saving_directory) +``` + + + +`load_in_8bit` is enabled by default for the models larger than 1 billion parameters. You can disable it with `load_in_8bit=False`. + + + +You can also provide a `quantization_config` instead to specify additional optimization parameters. + +#### 4-bit + +For the 4-bit weight quantization, you need a `quantization_config` to define the optimization parameters, for example: + +```python +from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig + +quantization_config = OVWeightQuantizationConfig(bits=4) +model = OVModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config) +``` + +You can tune quantization parameters to achieve a better performance accuracy trade-off as follows: + +```python +quantization_config = OVWeightQuantizationConfig(bits=4, sym=False, ratio=0.8, dataset="ptb") +``` + +By default the quantization scheme will be [asymmetric](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/Quantization.md#asymmetric-quantization), to make it [symmetric](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/Quantization.md#symmetric-quantization) you can add `sym=True`. + +For 4-bit quantization you can also specify the following arguments in the quantization configuration : +* The `group_size` parameter will define the group size to use for quantization, `-1` will result in per-column quantization. 
+* The `ratio` parameter controls the ratio between 4-bit and 8-bit quantization. If set to 0.9, it means that 90% of the layers will be quantized to `int4` while 10% will be quantized to `int8`. + +Smaller `group_size` and `ratio` values usually improve accuracy at the sacrifice of the model size and inference latency. + +### Static quantization + +When applying post-training static quantization, both the weights and the activations are quantized. +To apply quantization on the activations, an additional calibration step is needed which consists in feeding a `calibration_dataset` to the network in order to estimate the activation quantization parameters. + +Here is how to apply static quantization on a fine-tuned DistilBERT given your own `calibration_dataset`: + +```python +from transformers import AutoTokenizer +from optimum.intel import OVQuantizer, OVModelForSequenceClassification model_id = "distilbert-base-uncased-finetuned-sst-2-english" model = OVModelForSequenceClassification.from_pretrained(model_id, export=True) @@ -35,11 +92,22 @@ tokenizer = AutoTokenizer.from_pretrained(model_id) # The directory where the quantized model will be saved save_dir = "ptq_model" +quantizer = OVQuantizer.from_pretrained(model) + +# Apply static quantization and export the resulting quantized model to OpenVINO IR format +quantizer.quantize(calibration_dataset=calibration_dataset, save_directory=save_dir) +# Save the tokenizer +tokenizer.save_pretrained(save_dir) +``` + +The calibration dataset can also be created easily using your `OVQuantizer`: + +```python +from functools import partial + def preprocess_function(examples, tokenizer): return tokenizer(examples["sentence"], padding="max_length", max_length=128, truncation=True) -# Instantiate our OVQuantizer using the desired configuration -quantizer = OVQuantizer.from_pretrained(model) # Create the calibration dataset used to perform static quantization calibration_dataset = quantizer.get_calibration_dataset( "glue", @@ 
-48,33 +116,23 @@ calibration_dataset = quantizer.get_calibration_dataset( num_samples=300, dataset_split="train", ) -# Apply static quantization and export the resulting quantized model to OpenVINO IR format -quantizer.quantize( - calibration_dataset=calibration_dataset, - save_directory=save_dir, -) -# Save the tokenizer -tokenizer.save_pretrained(save_dir) ``` -The `quantize()` method applies post-training static quantization and export the resulting quantized model to the OpenVINO Intermediate Representation (IR). The resulting graph is represented with two files: an XML file describing the network topology and a binary file describing the weights. The resulting model can be run on any target Intel device. -## Weight-only quantization +The `quantize()` method applies post-training static quantization and exports the resulting quantized model to the OpenVINO Intermediate Representation (IR). The resulting graph is represented with two files: an XML file describing the network topology and a binary file describing the weights. The resulting model can be run on any target Intel device. -You can optimize the performance of text-generation LLMs by quantizing weights to various precisions that provide different performance-accuracy trade-offs. -```python -from optimum.intel import OVModelForCausalLM +### Hybrid quantization -model = OVModelForCausalLM.from_pretrained(model_id, load_in_8bit=True) -``` +Traditional optimization methods like post-training 8-bit quantization do not work well for Stable Diffusion (SD) models and can lead to poor generation results. On the other hand, weight compression does not improve performance significantly when applied to Stable Diffusion models, as the size of activations is comparable to weights. +The U-Net component takes up most of the overall execution time of the pipeline. Thus, optimizing just this one component can bring substantial benefits in terms of inference speed while keeping acceptable accuracy without fine-tuning. 
Quantizing the rest of the diffusion pipeline does not significantly improve inference performance but could potentially lead to substantial accuracy degradation. +Therefore, the proposal is to apply quantization in *hybrid mode* for the U-Net model and weight-only quantization for the rest of the pipeline components : +* U-Net : quantization applied on both the weights and activations +* The text encoder, VAE encoder / decoder : quantization applied on the weights -## Hybrid quantization +The hybrid mode involves the quantization of weights in MatMul and Embedding layers, and activations of other layers, facilitating accuracy preservation post-optimization while reducing the model size. -Traditional optimization methods like post-training 8-bit quantization do not work well for Stable Diffusion models and can lead to poor generation results. On the other hand, weight compression does not improve performance significantly when applied to Stable Diffusion models, as the size of activations is comparable to weights. -The UNet model takes up most of the overall execution time of the pipeline. Thus, optimizing just one model brings substantial benefits in terms of inference speed while keeping acceptable accuracy without fine-tuning. Quantizing the rest of the diffusion pipeline does not significantly improve inference performance but could potentially lead to substantial degradation of accuracy. -Therefore, the proposal is to apply quantization in *hybrid mode* for the UNet model and weight-only quantization for the rest of the pipeline components. The hybrid mode involves the quantization of weights in MatMul and Embedding layers, and activations of other layers, facilitating accuracy preservation post-optimization while reducing the model size. -The `quantization_config` is utilized to define optimization parameters for optimizing the Stable Diffusion pipeline. To enable hybrid quantization, specify the quantization dataset in the `quantization_config`. 
Otherwise, weight-only quantization to a specified data type (8 tr 4 bits) is applied to UNet model. +The `quantization_config` is utilized to define optimization parameters for optimizing the SD pipeline. To enable hybrid quantization, specify the quantization dataset in the `quantization_config`. If the dataset is not defined, weight-only quantization will be applied on all components. ```python from optimum.intel import OVStableDiffusionPipeline, OVWeightQuantizationConfig @@ -86,38 +144,11 @@ model = OVStableDiffusionPipeline.from_pretrained( ) ``` - - -`load_in_8bit` is enabled by default for the models larger than 1 billion parameters. - - - -For the 4-bit weight quantization you can use the `quantization_config` to specify the optimization parameters, for example: - -```python -from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig - -model = OVModelForCausalLM.from_pretrained( - model_id, - quantization_config=OVWeightQuantizationConfig(bits=4), -) -``` - -You can tune quantization parameters to achieve a better performance accuracy trade-off as follows: - -```python -from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig - -model = OVModelForCausalLM.from_pretrained( - model_id, - quantization_config=OVWeightQuantizationConfig(bits=4, sym=False, ratio=0.8, dataset="ptb"), -) -``` For more details, please refer to the corresponding NNCF [documentation](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/CompressWeights.md). -## Training-time optimization +## Training-time Apart from optimizing a model after training like post-training quantization above, `optimum.openvino` also provides optimization methods during training, namely Quantization-Aware Training (QAT) and Joint Pruning, Quantization and Distillation (JPQD). 
diff --git a/notebooks/openvino/optimum_openvino_inference.ipynb b/notebooks/openvino/optimum_openvino_inference.ipynb index 446e668911..b94238d358 100644 --- a/notebooks/openvino/optimum_openvino_inference.ipynb +++ b/notebooks/openvino/optimum_openvino_inference.ipynb @@ -9,7 +9,7 @@ "\n", "This notebook is a playground for running inference with OpenVINO on Transformers models with Optimum. The first part of this notebook explains the different ways to load a model, and some options specific to OpenVINO, like doing inference on an Intel GPU. The second part of this notebook consists of small examples for different supported tasks. \n", "\n", - "Do not forget to install the required dependencies before running this notebook with `pip install optimum[openvino] ipywidgets pillow torchaudio` or uncomment the cell below to install these requirements in your current Python environment. The audio classification example requires [ffmpeg](https://ffmpeg.org/download.html)." + "Do not forget to install the required dependencies before running this notebook by uncommenting the cell below to install these requirements in your current Python environment. The audio classification example requires [ffmpeg](https://ffmpeg.org/download.html)." 
] }, { @@ -17,18 +17,11 @@ "execution_count": 1, "id": "6a6774ad-912b-4053-b7f6-14dc020807ef", "metadata": { - "execution": { - "iopub.execute_input": "2023-03-12T19:47:17.121564Z", - "iopub.status.busy": "2023-03-12T19:47:17.121264Z", - "iopub.status.idle": "2023-03-12T19:47:17.125098Z", - "shell.execute_reply": "2023-03-12T19:47:17.124669Z", - "shell.execute_reply.started": "2023-03-12T19:47:17.121531Z" - }, "tags": [] }, "outputs": [], "source": [ - "# %pip install optimum[openvino] ipywidgets pillow torchaudio" + "# %pip install optimum[openvino] ipywidgets pillow torchaudio soundfile librosa" ] }, { @@ -52,13 +45,6 @@ "execution_count": 2, "id": "0c89b2a2-ce31-4773-9454-3e0e57d1a231", "metadata": { - "execution": { - "iopub.execute_input": "2023-03-12T19:47:17.127386Z", - "iopub.status.busy": "2023-03-12T19:47:17.127169Z", - "iopub.status.idle": "2023-03-12T19:47:24.576408Z", - "shell.execute_reply": "2023-03-12T19:47:24.575840Z", - "shell.execute_reply.started": "2023-03-12T19:47:17.127372Z" - }, "tags": [] }, "outputs": [ @@ -66,15 +52,26 @@ "name": "stderr", "output_type": "stream", "text": [ - "/home/helena/venvs/openvino_env/lib/python3.10/site-packages/openvino/offline_transformations/__init__.py:10: FutureWarning: The module is private and following namespace `offline_transformations` will be removed in the future.\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, onnx, openvino\n" + "/home/helena/venvs/openvino_env/lib/python3.10/site-packages/transformers/utils/generic.py:441: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n", + " _torch_pytree._register_pytree_node(\n", + "/home/helena/venvs/openvino_env/lib/python3.10/site-packages/transformers/utils/generic.py:309: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. 
Please use torch.utils._pytree.register_pytree_node instead.\n", + " _torch_pytree._register_pytree_node(\n", + "/home/helena/venvs/openvino_env/lib/python3.10/site-packages/transformers/utils/generic.py:309: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n", + " _torch_pytree._register_pytree_node(\n", + "/home/helena/venvs/openvino_env/lib/python3.10/site-packages/diffusers/utils/outputs.py:63: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n", + " torch.utils._pytree._register_pytree_node(\n", + "/home/helena/venvs/openvino_env/lib/python3.10/site-packages/diffusers/utils/outputs.py:63: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n", + " torch.utils._pytree._register_pytree_node(\n", + "/home/helena/venvs/openvino_env/lib/python3.10/site-packages/transformers/utils/generic.py:309: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n", + " _torch_pytree._register_pytree_node(\n", + "Framework not specified. Using pt to export the model.\n", + "Using the export variant default. Available variants are:\n", + " - default: The default ONNX variant.\n", + "Using framework PyTorch: 2.2.0+cpu\n", + "/home/helena/venvs/openvino_env/lib/python3.10/site-packages/transformers/models/distilbert/modeling_distilbert.py:246: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. 
In any other case, this might cause the trace to be incorrect.\n", + " mask, torch.tensor(torch.finfo(scores.dtype).min)\n", + "Compiling the model to CPU ...\n", + "Compiling the model to CPU ...\n" ] } ], @@ -104,13 +101,6 @@ "execution_count": 3, "id": "8053abe3-0e1f-445d-8397-630efac28269", "metadata": { - "execution": { - "iopub.execute_input": "2023-03-12T19:47:24.577918Z", - "iopub.status.busy": "2023-03-12T19:47:24.577422Z", - "iopub.status.idle": "2023-03-12T19:47:25.276093Z", - "shell.execute_reply": "2023-03-12T19:47:25.275685Z", - "shell.execute_reply.started": "2023-03-12T19:47:24.577895Z" - }, "tags": [] }, "outputs": [ @@ -154,13 +144,6 @@ "execution_count": 4, "id": "dcf7a5c3-81ba-42cb-a7d9-d22c5bb00325", "metadata": { - "execution": { - "iopub.execute_input": "2023-03-12T19:47:25.276779Z", - "iopub.status.busy": "2023-03-12T19:47:25.276603Z", - "iopub.status.idle": "2023-03-12T19:47:25.278703Z", - "shell.execute_reply": "2023-03-12T19:47:25.278355Z", - "shell.execute_reply.started": "2023-03-12T19:47:25.276764Z" - }, "tags": [] }, "outputs": [], @@ -174,20 +157,20 @@ "execution_count": 5, "id": "648a8eb1-1d50-4503-8094-c9a88098bee9", "metadata": { - "execution": { - "iopub.execute_input": "2023-03-12T19:47:25.279302Z", - "iopub.status.busy": "2023-03-12T19:47:25.279139Z", - "iopub.status.idle": "2023-03-12T19:47:26.802333Z", - "shell.execute_reply": "2023-03-12T19:47:26.801969Z", - "shell.execute_reply.started": "2023-03-12T19:47:25.279288Z" - }, "tags": [] }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Compiling the model to CPU ...\n" + ] + }, { "data": { "text/plain": [ - "{'score': 0.8515876531600952,\n", + "{'score': 0.8515874147415161,\n", " 'start': 12,\n", " 'end': 64,\n", " 'answer': 'a framework for deep learning inference optimization'}" @@ -234,16 +217,16 @@ "execution_count": 6, "id": "c5d8d4be-3449-4a78-aa92-56ef2b355572", "metadata": { - "execution": { - "iopub.execute_input": 
"2023-03-12T19:47:26.802936Z", - "iopub.status.busy": "2023-03-12T19:47:26.802808Z", - "iopub.status.idle": "2023-03-12T19:47:27.259642Z", - "shell.execute_reply": "2023-03-12T19:47:27.259125Z", - "shell.execute_reply.started": "2023-03-12T19:47:26.802923Z" - }, "tags": [] }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Compiling the model to CPU ...\n" + ] + }, { "data": { "text/plain": [ @@ -285,10 +268,10 @@ "### OpenVINO features\n", "\n", "- For improved performance, it is sometimes useful to reshape the model to use static input shapes\n", - "- Models can be compressed to FP16, which reduces model size by half, and improves performance on GPU (because GPUs contain optimizations for computations with FP16 data).\n", + "- On GPU, inference uses FP16 by default (GPUs contain optimizations for computations with FP16 data). On 4th generation and later Intel® Xeon® Scalable Processors, inference uses BF16 by default. \n", "- OpenVINO supports inference on Intel GPU, either an integrated GPU in your laptop or desktop, or an Intel discrete GPU, for example Intel Arc. \n", "\n", - "By default, when loading a model with `model = OVModelForXxx.from_pretrained(model_id)`, it is compiled on CPU. If you know you want to use GPU inference, static shapes, or FP16, you can set `compile=False` to the `.from_pretrained()` method, to skip the compilation step, as the model will have to be compiled again after steps such as reshaping, fp16 conversion or changing device. The model can then be compile with `model.compile()`. In the case the model was not compiled, it will be automatically done before the first inference, resulting in an increase of the first inference latency, since it will include the model compilation time." + "By default, when loading a model with `model = OVModelForXxx.from_pretrained(model_id)`, it is compiled on CPU. 
If you need to modify the model, for example to use static shapes, you can pass `compile=False` to the `.from_pretrained()` method, to skip the compilation step, as the model will have to be compiled again after steps such as reshaping or changing device. The model can then be compiled with `model.compile()`. In the case the model was not compiled, it will be automatically done before the first inference, resulting in an increase of the first inference latency, since it will include the model compilation time." ] }, { @@ -316,20 +299,20 @@ "execution_count": 7, "id": "e0754efa-0beb-4060-8633-daecc5ebca31", "metadata": { - "execution": { - "iopub.execute_input": "2023-03-12T19:47:27.261759Z", - "iopub.status.busy": "2023-03-12T19:47:27.261529Z", - "iopub.status.idle": "2023-03-12T19:47:31.027499Z", - "shell.execute_reply": "2023-03-12T19:47:31.027078Z", - "shell.execute_reply.started": "2023-03-12T19:47:27.261738Z" - }, "tags": [] }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Compiling the model to CPU ...\n" + ] + }, { "data": { "text/plain": [ - "{'score': 0.7991431951522827,\n", + "{'score': 0.7991424798965454,\n", " 'start': 12,\n", " 'end': 62,\n", " 'answer': 'a toolkit for deep learning inference optimization'}" @@ -366,12 +349,12 @@ }, { "cell_type": "markdown", - "id": "3a564882-dfd2-4f62-803b-f1e3485736c3", + "id": "8798b8d5-50ad-439f-a1d8-886d579a91fe", "metadata": {}, "source": [ - "#### Compressing model weights to FP16\n", + "#### Saving Model in FP16 format\n", "\n", - "Compressing model weights saves disk space, and speeds up inference on Intel GPU." + "`model.half()` converts the model weights to FP16 precision. This reduces the size of the model by half, with usually a negligible impact on accuracy." 
 ] }, { @@ -379,13 +362,6 @@ "execution_count": 8, "id": "09f443d5-ff58-416c-ab70-bee16e9f8235", "metadata": { - "execution": { - "iopub.execute_input": "2023-03-12T19:47:31.028314Z", - "iopub.status.busy": "2023-03-12T19:47:31.028164Z", - "iopub.status.idle": "2023-03-12T19:47:32.026899Z", - "shell.execute_reply": "2023-03-12T19:47:32.026017Z", - "shell.execute_reply.started": "2023-03-12T19:47:31.028301Z" - }, "tags": [] }, "outputs": [], @@ -401,7 +377,16 @@ "source": [ "#### Loading Model on GPU\n", "\n", - "For GPU inference, we recommend using FP16. OpenVINO support for dynamic shapes on GPU is in preview mode, so for now we recommend using static shapes.\n", + "A model can be loaded to GPU by using `model.to(\"GPU\")` or by passing `device` to `from_pretrained()`.\n", + "\n", + "GPU inference will automatically run with FP16 precision, regardless of the precision of the weights of the model. To override this, and force FP32 precision you can pass an `ov_config` argument to `.from_pretrained()`: \n", + "\n", + "```\n", + "model = OVModelForQuestionAnswering.from_pretrained(model_id,\n", + " device=\"GPU\",\n", + " ov_config={\"INFERENCE_PRECISION_HINT\": \"f32\"}\n", + ")\n", + "```\n", "\n", "OpenVINO's `Core().available_devices` property shows the supported devices on the system. 
" ] @@ -411,13 +396,6 @@ "execution_count": 9, "id": "09f13ef7-2321-47c6-a08f-5fbe61b32b43", "metadata": { - "execution": { - "iopub.execute_input": "2023-03-12T19:47:32.029229Z", - "iopub.status.busy": "2023-03-12T19:47:32.028882Z", - "iopub.status.idle": "2023-03-12T19:47:32.061033Z", - "shell.execute_reply": "2023-03-12T19:47:32.060524Z", - "shell.execute_reply.started": "2023-03-12T19:47:32.029205Z" - }, "tags": [] }, "outputs": [ @@ -426,7 +404,7 @@ "output_type": "stream", "text": [ "CPU 11th Gen Intel(R) Core(TM) i7-1185G7 @ 3.00GHz\n", - "GPU Intel(R) Iris(R) Xe Graphics [0x9a49] (iGPU)\n" + "GPU Intel(R) Iris(R) Xe Graphics (iGPU)\n" ] } ], @@ -442,16 +420,17 @@ "execution_count": 10, "id": "bf5194b5-5e80-4597-85d4-fab72fbed2fa", "metadata": { - "execution": { - "iopub.execute_input": "2023-03-12T19:47:32.061881Z", - "iopub.status.busy": "2023-03-12T19:47:32.061689Z", - "iopub.status.idle": "2023-03-12T19:47:33.067283Z", - "shell.execute_reply": "2023-03-12T19:47:33.066928Z", - "shell.execute_reply.started": "2023-03-12T19:47:32.061868Z" - }, "tags": [] }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Compiling the model to GPU ...\n", + "Setting OpenVINO CACHE_DIR to /home/helena/.cache/huggingface/hub/models--helenai--distilbert-base-uncased-distilled-squad-ov-fp32/snapshots/a9da64102a84c4b3f110c4d627937a110e56257f/model_cache\n" + ] + }, { "name": "stdout", "output_type": "stream", @@ -461,15 +440,35 @@ } ], "source": [ - "# Compile the model on GPU if a GPU is found\n", + "# Use `model.to()` to compile the model on GPU if a GPU is found\n", "if \"GPU\" in Core().available_devices:\n", - " model.half()\n", " model.reshape(1, 28)\n", " model.to(\"gpu\")\n", " model.compile()\n", " print(ov_pipe.model._device)" ] }, + { + "cell_type": "code", + "execution_count": 11, + "id": "2538c02e-fa35-459f-90fb-e6667ef8747b", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Compiling 
the model to GPU ...\n", + "Setting OpenVINO CACHE_DIR to distilbert-base-uncased-distilled-squad-ov-fp16/model_cache\n" + ] + } + ], + "source": [ + "# Set the device directly with `.from_pretrained()`\n", + "if \"GPU\" in Core().available_devices:\n", + " model = OVModelForQuestionAnswering.from_pretrained(\"distilbert-base-uncased-distilled-squad-ov-fp16\", device=\"GPU\")" + ] + }, { "cell_type": "markdown", "id": "8e3a06fd-e51b-404e-b82a-2557e3db1375", @@ -507,23 +506,48 @@ "source": [ "Audio classification is the task of automatically categorizing audio data into classes or categories. See Hugging Face's [audio-classification](https://huggingface.co/tasks/audio-classificationhttps://huggingface.co/tasks/audio-classification) documentation for more information.\n", "\n", - "In this example, we use the [MIT/ast-finetuned-speech-commands-v2](https://huggingface.co/MIT/ast-finetuned-speech-commands-v2) model to do inference on an audio file from the [speech commands](https://huggingface.co/datasets/speech_commands/viewer/v0.01/test) dataset. You can try your own audio file too. To see the classes that this model was trained on, run `model.config.id2label`\n", + "In this example, we use the [MIT/ast-finetuned-speech-commands-v2](https://huggingface.co/MIT/ast-finetuned-speech-commands-v2) model to do inference on an audio file from the [speech commands](https://huggingface.co/datasets/speech_commands/viewer/v0.01/test) dataset. You can try your own audio file too. To do that, set `audio_sample = /path/to/audio_file`. To see the classes that this model was trained on, run `model.config.id2label`\n", "\n", "The model pipeline needs ffmpeg. 
On Ubuntu Linux: `sudo apt install ffmpeg`; see https://ffmpeg.org/download.html for other OSs" ] }, { "cell_type": "code", - "execution_count": 11, - "id": "570fd296-15a1-437d-97db-76b91407dc3c", + "execution_count": 12, + "id": "6b725d0a-e230-4b0e-b6b6-3d6d81eb984d", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Compiling the model to CPU ...\n" + ] + } + ], + "source": [ + "from IPython.display import Audio\n", + "from optimum.intel.openvino import OVModelForAudioClassification\n", + "from transformers import AutoFeatureExtractor, pipeline\n", + "from datasets import load_dataset\n", + "\n", + "model_id = \"helenai/MIT-ast-finetuned-speech-commands-v2-ov\"\n", + "model = OVModelForAudioClassification.from_pretrained(model_id)\n", + "feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)\n", + "ov_pipe = pipeline(\"audio-classification\", model=model, feature_extractor=feature_extractor)\n", + "\n", + "# streaming=true enables loading one item from the dataset without downloading the full dataset\n", + "dataset = load_dataset(\"speech_commands\", \"v0.02\", streaming=True)\n", + "audio_sample = next(iter(dataset[\"test\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "d3012174-b58f-4efb-bb73-a88dbfb20392", "metadata": { - "execution": { - "iopub.execute_input": "2023-03-12T19:47:33.068965Z", - "iopub.status.busy": "2023-03-12T19:47:33.068575Z", - "iopub.status.idle": "2023-03-12T19:47:38.120307Z", - "shell.execute_reply": "2023-03-12T19:47:38.119775Z", - "shell.execute_reply.started": "2023-03-12T19:47:33.068947Z" - }, "tags": [] }, "outputs": [ @@ -532,7 +556,7 @@ "text/html": [ "\n", " \n", " " @@ -547,29 +571,28 @@ { "data": { "text/plain": [ - "[{'score': 0.9999880790710449, 'label': 'down'},\n", - " {'score': 7.452485419889854e-07, 'label': 'five'},\n", - " {'score': 7.436851205966377e-07, 'label': 'go'}]" + "[{'score': 0.9999935626983643, 'label': 
'backward'},\n", + " {'score': 3.4823816008611175e-07, 'label': 'forward'},\n", + " {'score': 3.3890643180711777e-07, 'label': 'wow'}]" ] }, - "execution_count": 11, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "from IPython.display import Audio\n", - "from optimum.intel.openvino import OVModelForAudioClassification\n", - "from transformers import AutoFeatureExtractor, pipeline\n", + "if isinstance(audio_sample, dict):\n", + " audio_data = audio_sample[\"audio\"][\"array\"]\n", + " sampling_rate = audio_sample[\"audio\"][\"sampling_rate\"]\n", + "else:\n", + " # if audio_sample is not a dataset item, it should be the path to an audio file\n", + " audio_data = audio_sample\n", + " sampling_rate = None\n", "\n", - "model_id = \"helenai/MIT-ast-finetuned-speech-commands-v2-ov\"\n", - "model = OVModelForAudioClassification.from_pretrained(model_id)\n", - "feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)\n", - "ov_pipe = pipeline(\"audio-classification\", model=model, feature_extractor=feature_extractor)\n", + "display(Audio(audio_data, rate=sampling_rate))\n", "\n", - "audio_url_or_file = \"https://datasets-server.huggingface.co/assets/speech_commands/--/v0.01/test/38/audio/audio.mp3\"\n", - "display(Audio(audio_url_or_file))\n", - "ov_pipe(audio_url_or_file, top_k=3)" + "ov_pipe(audio_data, top_k=3)" ] }, { @@ -586,16 +609,9 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 14, "id": "5207a5af-3b53-43b3-ae5e-5b352c0f08d4", "metadata": { - "execution": { - "iopub.execute_input": "2023-03-12T19:47:38.121136Z", - "iopub.status.busy": "2023-03-12T19:47:38.120930Z", - "iopub.status.idle": "2023-03-12T19:47:42.382362Z", - "shell.execute_reply": "2023-03-12T19:47:42.381733Z", - "shell.execute_reply.started": "2023-03-12T19:47:38.121116Z" - }, "tags": [] }, "outputs": [ @@ -603,18 +619,20 @@ "name": "stderr", "output_type": "stream", "text": [ + "Provided model does not contain state. 
It may lead to sub-optimal performance.Please reexport model with updated OpenVINO version >= 2023.3.0 calling the `from_pretrained` method with original model and `export=True` parameter\n", + "Compiling the model to CPU ...\n", "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n" ] }, { "data": { "text/plain": [ - "[{'generated_text': \"Hello, I'm a language model, so you might consider changing your code to a similar type for Python 2 or 3.\\n\\nWhy Python 2\"},\n", - " {'generated_text': \"Hello, I'm a language model, so the second statement is true or true. You can also have both. It's not strictly necessary, but\"},\n", - " {'generated_text': 'Hello, I\\'m a language model, and we\\'re seeing it all in the Java world,\" he added.\\n\\nSo what might happen next?'}]" + "[{'generated_text': \"Hello, I'm a language model, so I'm really interested in programming. I've always been a programming student. But for a long time,\"},\n", + " {'generated_text': \"Hello, I'm a language model, because I don't think I ever spoke on paper to an editor. I'm simply someone reading a paper.\"},\n", + " {'generated_text': \"Hello, I'm a language model, I understand. Your mother was talking to you. 
No, well, my grandmother had said that she heard '\"}]" ] }, - "execution_count": 12, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -648,19 +666,19 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 15, "id": "97569002-0c0a-4208-9b63-b2bab209fb81", "metadata": { - "execution": { - "iopub.execute_input": "2023-03-12T19:47:42.383244Z", - "iopub.status.busy": "2023-03-12T19:47:42.383095Z", - "iopub.status.idle": "2023-03-12T19:47:48.887705Z", - "shell.execute_reply": "2023-03-12T19:47:48.887095Z", - "shell.execute_reply.started": "2023-03-12T19:47:42.383228Z" - }, "tags": [] }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Compiling the model to CPU ...\n" + ] + }, { "data": { "image/jpeg": "", @@ -674,12 +692,12 @@ { "data": { "text/plain": [ - "[{'score': 0.4759259521961212, 'label': 'flat-coated retriever'},\n", - " {'score': 0.10909564793109894, 'label': 'Labrador retriever'},\n", - " {'score': 0.08196048438549042, 'label': 'Great Dane'}]" + "[{'score': 0.4759257435798645, 'label': 'flat-coated retriever'},\n", + " {'score': 0.10909581184387207, 'label': 'Labrador retriever'},\n", + " {'score': 0.0819605216383934, 'label': 'Great Dane'}]" ] }, - "execution_count": 13, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -716,33 +734,33 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 16, "id": "1cd9423b-3cba-4cf9-b58a-bffbde08738d", "metadata": { - "execution": { - "iopub.execute_input": "2023-03-12T19:47:48.888614Z", - "iopub.status.busy": "2023-03-12T19:47:48.888417Z", - "iopub.status.idle": "2023-03-12T19:47:52.868441Z", - "shell.execute_reply": "2023-03-12T19:47:52.867833Z", - "shell.execute_reply.started": "2023-03-12T19:47:48.888595Z" - }, "tags": [] }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Compiling the model to CPU ...\n" + ] + }, { "data": { "text/plain": [ - "[{'score': 
0.0960492491722107,\n", + "[{'score': 0.09604837000370026,\n", " 'token': 4827,\n", " 'token_str': 'fashion',\n", " 'sequence': 'i am a fashion model'},\n", - " {'score': 0.09326528012752533,\n", + " {'score': 0.09326566755771637,\n", " 'token': 2535,\n", " 'token_str': 'role',\n", " 'sequence': 'i am a role model'}]" ] }, - "execution_count": 14, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -789,29 +807,29 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 17, "id": "5d9cedee-902c-41b2-b6ef-1c04bbd8498e", "metadata": { - "execution": { - "iopub.execute_input": "2023-03-12T19:47:52.869399Z", - "iopub.status.busy": "2023-03-12T19:47:52.869151Z", - "iopub.status.idle": "2023-03-12T19:47:53.395475Z", - "shell.execute_reply": "2023-03-12T19:47:53.394997Z", - "shell.execute_reply.started": "2023-03-12T19:47:52.869376Z" - }, "tags": [] }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Compiling the model to CPU ...\n" + ] + }, { "data": { "text/plain": [ - "{'score': 0.5305245518684387,\n", + "{'score': 0.5305243730545044,\n", " 'start': 12,\n", " 'end': 75,\n", " 'answer': 'an open source toolkit for deep learning inference optimization'}" ] }, - "execution_count": 15, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -845,26 +863,28 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 18, "id": "755096b0-406d-4167-8642-5b4b093e2bc5", "metadata": { - "execution": { - "iopub.execute_input": "2023-03-12T19:47:53.396122Z", - "iopub.status.busy": "2023-03-12T19:47:53.395987Z", - "iopub.status.idle": "2023-03-12T19:48:00.052492Z", - "shell.execute_reply": "2023-03-12T19:48:00.052125Z", - "shell.execute_reply.started": "2023-03-12T19:47:53.396109Z" - }, "tags": [] }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Compiling the encoder to CPU ...\n", + "Compiling the decoder to CPU ...\n", + "Compiling the 
decoder to CPU ...\n" + ] + }, { "data": { "text/plain": [ "'Das Haus ist wunderbar.'" ] }, - "execution_count": 16, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -884,16 +904,9 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 19, "id": "c0f096f2-0cd3-40ab-9661-1c7209c6a670", "metadata": { - "execution": { - "iopub.execute_input": "2023-03-12T19:48:00.053145Z", - "iopub.status.busy": "2023-03-12T19:48:00.052965Z", - "iopub.status.idle": "2023-03-12T19:48:00.056985Z", - "shell.execute_reply": "2023-03-12T19:48:00.056530Z", - "shell.execute_reply.started": "2023-03-12T19:48:00.053133Z" - }, "tags": [] }, "outputs": [ @@ -906,7 +919,7 @@ " 'translation_en_to_ro']" ] }, - "execution_count": 17, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -918,16 +931,9 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 20, "id": "ebf97e9c-d1e1-4908-8a3a-bad24ee3a34f", "metadata": { - "execution": { - "iopub.execute_input": "2023-03-12T19:48:00.057929Z", - "iopub.status.busy": "2023-03-12T19:48:00.057694Z", - "iopub.status.idle": "2023-03-12T19:48:00.399891Z", - "shell.execute_reply": "2023-03-12T19:48:00.399417Z", - "shell.execute_reply.started": "2023-03-12T19:48:00.057914Z" - }, "tags": [] }, "outputs": [ @@ -937,7 +943,7 @@ "[{'translation_text': \"Qu'est-ce qu'un modèle de séquence à séquence?\"}]" ] }, - "execution_count": 18, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -967,26 +973,26 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 21, "id": "59df852f-e9bb-4f96-91d9-23f4e4577ea0", "metadata": { - "execution": { - "iopub.execute_input": "2023-03-12T19:48:00.400753Z", - "iopub.status.busy": "2023-03-12T19:48:00.400401Z", - "iopub.status.idle": "2023-03-12T19:48:04.459054Z", - "shell.execute_reply": "2023-03-12T19:48:04.458437Z", - "shell.execute_reply.started": "2023-03-12T19:48:00.400729Z" - }, "tags": [] 
}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Compiling the model to CPU ...\n" + ] + }, { "data": { "text/plain": [ "[{'label': 'nl', 'score': 0.994126558303833}]" ] }, - "execution_count": 19, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -1018,19 +1024,19 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 22, "id": "87f633c0-3334-4ef0-942a-5bdec807568d", "metadata": { - "execution": { - "iopub.execute_input": "2023-03-12T19:48:04.459956Z", - "iopub.status.busy": "2023-03-12T19:48:04.459670Z", - "iopub.status.idle": "2023-03-12T19:48:07.863986Z", - "shell.execute_reply": "2023-03-12T19:48:07.863615Z", - "shell.execute_reply.started": "2023-03-12T19:48:04.459935Z" - }, "tags": [] }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Compiling the model to CPU ...\n" + ] + }, { "name": "stdout", "output_type": "stream", @@ -1070,6 +1076,14 @@ "- Notebook: [Post Training Quantization of a question-answering model](https://github.com/huggingface/optimum-intel/blob/main/notebooks/openvino/question_answering_quantization.ipynb)\n", "- Examples: [Quantization Aware Training examples](https://github.com/huggingface/optimum-intel/tree/main/examples/openvino)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0606fb02-2367-4573-8461-7fc8c065ece6", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -1088,7 +1102,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.10.12" }, "widgets": { "application/vnd.jupyter.widget-state+json": { diff --git a/notebooks/openvino/phi-2_on_mtl.ipynb b/notebooks/openvino/phi-2_on_mtl.ipynb new file mode 100644 index 0000000000..88f0387f05 --- /dev/null +++ b/notebooks/openvino/phi-2_on_mtl.ipynb @@ -0,0 +1,583 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "aeb16663-be53-4260-b62d-44611b6771ec", + 
"metadata": {}, + "source": [ + "# Chat and Code with Phi-2 with OpenVINO and 🤗 Optimum on Intel Meteor Lake iGPU\n", + "In this notebook we will show how to export and apply weight only quantization on Phi-2 to 4 bits.\n", + "Then using the quantized model we will show how to generate code completions with the model running on Intel Meteor Lake iGPU presenting a good experience of running GenAI locally on Intel PC marking the start of the AIPC Era!\n", + "Then we will show how to talk with Phi-2 in a ChatBot demo running completely locally on your Laptop!\n", + "\n", + "[Phi-2](https://huggingface.co/microsoft/phi-2) is a 2.7 billion-parameter language model trained by Microsoft. Microsoft in the model's release [blog post](https://www.microsoft.com/en-us/research/blog/phi-2-the-surprising-power-of-small-language-models/) states that Phi-2:\n", + "> demonstrates outstanding reasoning and language understanding capabilities, showcasing state-of-the-art performance among base language models with less than 13 billion parameters. On complex benchmarks Phi-2 matches or outperforms models up to 25x larger, thanks to new innovations in model scaling and training data curation." + ] + }, + { + "cell_type": "markdown", + "id": "03cb49cf-bc6f-4702-a61f-227b352404cb", + "metadata": {}, + "source": [ + "## Install dependencies\n", + "Make sure you have the latest GPU drivers installed on your machine: https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html.\n", + "\n", + "We will start by installing the dependencies, that can be done by uncommenting the following cell and run it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "96d8203c-34c9-41a2-95bd-3891533840a1", + "metadata": {}, + "outputs": [], + "source": [ + "# ! 
pip install optimum[openvino,nncf] torch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5980ce40-0be1-48c1-941a-92c484d4da31", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "from transformers import AutoTokenizer\n", + "from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig" + ] + }, + { + "cell_type": "markdown", + "id": "48b81857-a095-43a3-8c8d-4c880b743a6e", + "metadata": {}, + "source": [ + "## Configuration\n", + "Here we will configure which model to load and other attributes. We will explain everything 😄\n", + "* `model_name`: the name or path of the model we want to export and quantize, can be either on the 🤗 Hub or a local directory on your laptop.\n", + "* `save_name`: directory where the exported & quantized model will be saved.\n", + "* `precision`: the compute data type we will use for inference of the model, can be either `f32` or `f16`. We use FP32 precision due to Phi-2 overflow issues in FP16.\n", + "* `quantization_config`: here we set the attributes for the weight only quantization algorithm:\n", + " * `bits`: number of bits to use for quantization, can be either `8` or `4`.\n", + " * `sym`: whether to use symmetric quantization or not, can be either `True` or `False`.\n", + " * `group_size`: number of weights to group together for quantization. We use groups of 128 to ensure no accuracy degradation.\n", + " * `ratio`: the ratio of the model to quantize to #`bits`. 
The rest will be quantized to the default bits number, `8`.\n", + "* `device`: the device to use for inference, can be either `cpu` or `gpu`.\n", + "* `stateful`: Optimize model by setting the KV cache as part of the model's state instead of as an input\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "800cd7a3-a21d-4a0a-9d73-2a2d08646f99", + "metadata": {}, + "outputs": [], + "source": [ + "model_name = 'microsoft/phi-2'\n", + "save_name = './phi-2-woq4'\n", + "precision = 'f32'\n", + "quantization_config = OVWeightQuantizationConfig(\n", + " bits=4,\n", + " sym=False,\n", + " group_size=128,\n", + " ratio=0.8,\n", + ")\n", + "device = 'gpu'" + ] + }, + { + "cell_type": "markdown", + "id": "1f398868-93d7-4c2d-9591-9bac8e9b701c", + "metadata": {}, + "source": [ + "With this configuration we expect the model size to reduce to around 1.62GB: $0.8 \\times 2.7{\\times}10^9 \\times \\frac{1}{2}\\text{B} + 0.2 \\times 2.7{\\times}10^9 \\times 1\\text{B} = 1.62{\\times}10^9\\text{B} = 1.62\\text{GB}$" + ] + }, + { + "cell_type": "markdown", + "id": "d994997d-344c-4d6c-ab08-f78ecb7f56ec", + "metadata": {}, + "source": [ + "## Export & quantize\n", + "OpenVINO together with 🤗 Optimum enables you to load, export and quantize a model in a single `from_pretrained` call making the process as simple as possible.\n", + "Then, we will save the exported & quantized model locally on our laptop. If the model was already exported and saved before we will load the locally saved model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "03a308c6-27e7-4926-8ac4-4fa0c1ca68d2", + "metadata": {}, + "outputs": [], + "source": [ + "# Load kwargs\n", + "load_kwargs = {\n", + " 'device': device,\n", + " 'ov_config': {\n", + " \"PERFORMANCE_HINT\": \"LATENCY\",\n", + " \"INFERENCE_PRECISION_HINT\": precision,\n", + " \"CACHE_DIR\": os.path.join(save_name, \"model_cache\"), # OpenVINO will use this directory as cache\n", + " },\n", + " 'compile': False,\n", + " 'quantization_config': quantization_config\n", + "}\n", + "\n", + "# Check whether the model was already exported\n", + "saved = os.path.exists(save_name)\n", + "\n", + "model = OVModelForCausalLM.from_pretrained(\n", + " model_name if not saved else save_name,\n", + " export=not saved,\n", + " **load_kwargs,\n", + ")\n", + "\n", + "# Load tokenizer to be used with the model\n", + "tokenizer = AutoTokenizer.from_pretrained(model_name if not saved else save_name)\n", + "\n", + "# Save the exported model locally\n", + "if not saved:\n", + " model.save_pretrained(save_name)\n", + " tokenizer.save_pretrained(save_name)\n", + "\n", + "# TODO Optional: export to huggingface/hub\n", + "\n", + "model_size = os.stat(os.path.join(save_name, 'openvino_model.bin')).st_size / 1024 ** 3\n", + "print(f'Model size in FP32: ~5.4GB, current model size in 4bit: {model_size:.2f}GB')" + ] + }, + { + "cell_type": "markdown", + "id": "592e118d-e8bb-491f-92b2-d0418e19158c", + "metadata": {}, + "source": [ + "We can see the model size was reduced to 1.7GB as expected. After loading the model we can switch the model between devices using `model.to('gpu')` for example.\n", + "After we have finished to configure everything, we can compile the model by calling `model.compile()` and the model will be ready for usage." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3cef4dc0-191e-4755-a639-c3e8adbd18a2", + "metadata": {}, + "outputs": [], + "source": [ + "model.compile()" + ] + }, + { + "cell_type": "markdown", + "id": "dd3c467e-3bbb-4265-9075-1c6688af2f92", + "metadata": {}, + "source": [ + "## Generate using the exported model\n", + "We will now show an example where we will use our quantized Phi-2 to generate code in Python. \n", + "Phi-2 knows how to do code completions where the model is given a function's signature and its docstring and the model will generate the implementation of the function.\n", + "\n", + "In our example we have taken one of the samples from the test set of HumanEval dataset. \n", + "HumanEval is a code completion dataset used to train and benchmark models on code completion in Python. \n", + "Phi-2 has scored a remarkable result on the HumanEval dataset and is an excellent model to use for code completions.\n", + "\n", + "Note: the first time you run the model might take more time due to loading and compilation overheads of the first inference" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4b4ea738-7db5-490e-9338-d6420b77796c", + "metadata": {}, + "outputs": [], + "source": [ + "sample = \"\"\"from typing import List\n", + "\n", + "\n", + "def has_close_elements(numbers: List[float], threshold: float) -> bool:\n", + " \\\"\\\"\\\" Check if in given list of numbers, are any two numbers closer to each other than\n", + " given threshold.\n", + " >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n", + " False\n", + " >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n", + " True\n", + " \\\"\\\"\\\"\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "14ffe7f9-7d93-4a49-95d8-5f2a4e400cfe", + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import TextStreamer\n", + "\n", + "# Tokenize the sample\n", + "inputs = tokenizer([sample], return_tensors='pt')\n", + 
"\n", + "# Call generate on the inputs\n", + "out = model.generate(\n", + " **inputs,\n", + " max_new_tokens=128,\n", + " streamer=TextStreamer(tokenizer=tokenizer, skip_special_tokens=True),\n", + " pad_token_id=tokenizer.eos_token_id,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "3f8aa25c-de59-4e79-9a1f-c03ec76d206a", + "metadata": {}, + "source": [ + "## Chatbot demo\n", + "We will continue to build a chatbot demo running with Gradio using the model we just exported and quantized.\n", + "The chatbot will be rather simple where the user will input a message and the model will reply to the user by generating text using the entire chat history as the input to the model.\n", + "\n", + "A lot of models that were trained for the chatbot use case have been trained with special tokens to tell the model who is the current speaker and with a special system message. \n", + "Phi-2 wasn't trained specifically for the chatbot use case and doesn't have any special tokens either, however, it has seen chats in the training data and therefore is suited for that use case.\n", + "\n", + "The chat template we will use is rather simple:\n", + "```\n", + "User: \n", + "Assistant: \n", + "User: \n", + "...\n", + "```\n", + "\n", + "We will start by writing the core function of the chatbot that receives the entire history of the chat and generates the assistant's response.\n", + "To support this core function we will build a few assistant functions to prepare the input for the model and to stop generation in time." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7e81d125-ff47-4122-853d-11a2763db146", + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "from threading import Thread\n", + "\n", + "from transformers import (\n", + " TextIteratorStreamer,\n", + " StoppingCriteria,\n", + " StoppingCriteriaList,\n", + " GenerationConfig,\n", + ")\n", + "\n", + "\n", + "# Copied and modified from https://github.com/bigcode-project/bigcode-evaluation-harness/blob/main/bigcode_eval/generation.py#L13\n", + "class SuffixCriteria(StoppingCriteria):\n", + " def __init__(self, start_length, eof_strings, tokenizer, check_fn=None):\n", + " self.start_length = start_length\n", + " self.eof_strings = eof_strings\n", + " self.tokenizer = tokenizer\n", + " if check_fn is None:\n", + " check_fn = lambda decoded_generation: any(\n", + " [decoded_generation.endswith(stop_string) for stop_string in self.eof_strings]\n", + " )\n", + " self.check_fn = check_fn\n", + "\n", + " def __call__(self, input_ids, scores, **kwargs):\n", + " \"\"\"Returns True if generated sequence ends with any of the stop strings\"\"\"\n", + " decoded_generations = self.tokenizer.batch_decode(input_ids[:, self.start_length :])\n", + " return all([self.check_fn(decoded_generation) for decoded_generation in decoded_generations])\n", + "\n", + "\n", + "def is_partial_stop(output, stop_str):\n", + " \"\"\"Check whether the output contains a partial stop str.\"\"\"\n", + " for i in range(0, min(len(output), len(stop_str))):\n", + " if stop_str.startswith(output[-i:]):\n", + " return True\n", + " return False\n", + "\n", + "\n", + "\n", + "# Set the chat template to the tokenizer. 
The chat template implements the simple template of\n", + "# User: content\n", + "# Assistant: content\n", + "# ...\n", + "# Read more about chat templates here https://huggingface.co/docs/transformers/main/en/chat_templating\n", + "tokenizer.chat_template = \"{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}\"\n", + "\n", + "\n", + "def prepare_history_for_model(history):\n", + " \"\"\"\n", + " Converts the history to a tokenized prompt in the format expected by the model.\n", + " Params:\n", + " history: dialogue history\n", + " Returns:\n", + " Tokenized prompt\n", + " \"\"\"\n", + " messages = []\n", + " for idx, (user_msg, model_msg) in enumerate(history):\n", + " # skip the last assistant message if it's empty, the tokenizer will do the formatting\n", + " if idx == len(history) - 1 and not model_msg:\n", + " messages.append({'role': 'User', 'content': user_msg})\n", + " break\n", + " if user_msg:\n", + " messages.append({'role': 'User', 'content': user_msg})\n", + " if model_msg:\n", + " messages.append({'role': 'Assistant', 'content': model_msg})\n", + " input_token = tokenizer.apply_chat_template(\n", + " messages,\n", + " add_generation_prompt=True,\n", + " tokenize=True,\n", + " return_tensors=\"pt\",\n", + " return_dict=True\n", + " )\n", + " return input_token\n", + "\n", + "\n", + "def generate(history, temperature, max_new_tokens, top_p, repetition_penalty):\n", + " \"\"\"\n", + " Generates the assistant's response given the chatbot history and generation parameters\n", + "\n", + " Params:\n", + " history: conversation history formatted in pairs of user and assistant messages `[user_message, assistant_message]`\n", + " temperature: parameter for controlling the level of creativity in AI-generated text.\n", + " By adjusting the `temperature`, you can influence the AI model's probability distribution, making the text more focused or diverse.\n", + " 
max_new_tokens: The maximum number of tokens we allow the model to generate as a response.\n", + " top_p: parameter for control the range of tokens considered by the AI model based on their cumulative probability.\n", + " repetition_penalty: parameter for penalizing tokens based on how frequently they occur in the text.\n", + " Yields:\n", + " Updated history and generation status.\n", + " \"\"\"\n", + " start = time.perf_counter()\n", + " # Construct the input message string for the model by concatenating the current system message and conversation history\n", + " # Tokenize the messages string\n", + " inputs = prepare_history_for_model(history)\n", + " input_length = inputs['input_ids'].shape[1]\n", + " # truncate input in case it is too long.\n", + " # TODO improve this\n", + " if input_length > 2000:\n", + " history = [history[-1]]\n", + " inputs = prepare_history_for_model(history)\n", + " input_length = inputs['input_ids'].shape[1]\n", + "\n", + " prompt_char = '▌'\n", + " history[-1][1] = prompt_char\n", + " yield (history, 'Status: Generating...')\n", + " \n", + " streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)\n", + "\n", + " # Create a stopping criteria to prevent the model from playing the role of the user aswell.\n", + " stop_str = f'\\nUser:'\n", + " stopping_criteria = StoppingCriteriaList([SuffixCriteria(input_length, [stop_str], tokenizer)])\n", + " # Prepare input for generate\n", + " generation_config = GenerationConfig(\n", + " max_new_tokens=max_new_tokens,\n", + " do_sample=temperature > 0.0,\n", + " temperature=temperature if temperature > 0.0 else 1.0,\n", + " repetition_penalty=repetition_penalty,\n", + " top_p=top_p,\n", + " eos_token_id=[tokenizer.eos_token_id],\n", + " pad_token_id=tokenizer.eos_token_id,\n", + " )\n", + " generate_kwargs = dict(\n", + " streamer=streamer,\n", + " generation_config=generation_config,\n", + " stopping_criteria=stopping_criteria,\n", + " ) | inputs\n", + "\n", + " t1 = 
Thread(target=model.generate, kwargs=generate_kwargs)\n", + " t1.start()\n", + "\n", + " # Initialize an empty string to store the generated text.\n", + " partial_text = \"\"\n", + " for new_text in streamer:\n", + " partial_text += new_text\n", + " history[-1][1] = partial_text + prompt_char\n", + " # We don't yield the generated text until we are sure it is not the stop string\n", + " pos = partial_text.rfind(stop_str)\n", + " if pos != -1:\n", + " partial_text = partial_text[:pos]\n", + " break\n", + " elif is_partial_stop(partial_text, stop_str):\n", + " continue\n", + " yield (history, 'Status: Generating...')\n", + " history[-1][1] = partial_text\n", + " generation_time = time.perf_counter() - start\n", + " yield (history, f'Generation time: {generation_time:.2f} sec')" + ] + }, + { + "cell_type": "markdown", + "id": "29fe1ae5-9929-4789-9293-612b2062e2a8", + "metadata": {}, + "source": [ + "Next we will create the actual demo using Gradio. The layout will be very simple, a chatbot window followed by a text prompt and some controls.\n", + "We will also include sliders to adjust generation parameters like temperature and length of response we allow the model to generate.\n", + "\n", + "To install Gradio dependency, please uncomment the following cell and run" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b61a9a9f", + "metadata": {}, + "outputs": [], + "source": [ + "# ! pip install gradio" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ae1aa4e-3539-49a1-8f32-62b818ee1002", + "metadata": {}, + "outputs": [], + "source": [ + "import gradio as gr\n", + "\n", + "\n", + "EXAMPLES = [\n", + " [\"What is OpenVINO?\"],\n", + " [\"Can you explain to me briefly what is Python programming language?\"],\n", + " [\"Explain the plot of Cinderella in a sentence.\"],\n", + " [\"Write a Python function to perform binary search over a sorted list. 
Use markdown to write code\"],\n", + " [\"Lily has a rubber ball that she drops from the top of a wall. The wall is 2 meters tall. How long will it take for the ball to reach the ground?\"],\n", + "]\n", + "\n", + "\n", + "def add_user_text(message, history):\n", + " \"\"\"\n", + " Add user's message to chatbot history\n", + "\n", + " Params:\n", + " message: current user message\n", + " history: conversation history\n", + " Returns:\n", + " Updated history, clears user message and status\n", + " \"\"\"\n", + " # Append current user message to history with a blank assistant message which will be generated by the model\n", + " history.append([message, None])\n", + " return ('', history)\n", + "\n", + "\n", + "with gr.Blocks(theme=gr.themes.Soft()) as demo:\n", + " gr.Markdown('

Chat with Phi-2 on Meteor Lake iGPU

')\n", + " chatbot = gr.Chatbot()\n", + " with gr.Row():\n", + " msg = gr.Textbox(placeholder=\"Enter message here...\", show_label=False, autofocus=True, scale=75)\n", + " status = gr.Textbox(\"Status: Idle\", show_label=False, max_lines=1, scale=25)\n", + " with gr.Row():\n", + " submit = gr.Button(\"Submit\", variant='primary')\n", + " clear = gr.Button(\"Clear\")\n", + " with gr.Accordion(\"Advanced Options:\", open=False):\n", + " with gr.Row():\n", + " with gr.Column():\n", + " temperature = gr.Slider(\n", + " label=\"Temperature\",\n", + " value=0.0,\n", + " minimum=0.0,\n", + " maximum=1.0,\n", + " step=0.05,\n", + " interactive=True,\n", + " )\n", + " max_new_tokens = gr.Slider(\n", + " label=\"Max new tokens\",\n", + " value=128,\n", + " minimum=0,\n", + " maximum=512,\n", + " step=32,\n", + " interactive=True,\n", + " )\n", + " with gr.Column():\n", + " top_p = gr.Slider(\n", + " label=\"Top-p (nucleus sampling)\",\n", + " value=1.0,\n", + " minimum=0.0,\n", + " maximum=1.0,\n", + " step=0.05,\n", + " interactive=True,\n", + " )\n", + " repetition_penalty = gr.Slider(\n", + " label=\"Repetition penalty\",\n", + " value=1.0,\n", + " minimum=1.0,\n", + " maximum=2.0,\n", + " step=0.1,\n", + " interactive=True,\n", + " )\n", + " gr.Examples(\n", + " EXAMPLES, inputs=msg, label=\"Click on any example and press the 'Submit' button\"\n", + " )\n", + "\n", + " # Sets generate function to be triggered when the user submit a new message\n", + " gr.on(\n", + " triggers=[submit.click, msg.submit],\n", + " fn=add_user_text,\n", + " inputs=[msg, chatbot],\n", + " outputs=[msg, chatbot],\n", + " queue=False,\n", + " ).then(\n", + " fn=generate,\n", + " inputs=[chatbot, temperature, max_new_tokens, top_p, repetition_penalty],\n", + " outputs=[chatbot, status],\n", + " concurrency_limit=1,\n", + " queue=True\n", + " )\n", + " \n", + " clear.click(fn=lambda: (None, 'Status: Idle'), inputs=None, outputs=[chatbot, status], queue=False)" + ] + }, + { + "cell_type": 
"markdown", + "id": "1d1baf09-26f1-40ab-896c-3468b5e89fec", + "metadata": {}, + "source": [ + "That's it, all that is left is to start the demo!\n", + "\n", + "When you're done you can use `demo.close()` to close the demo" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b73962d-f977-45b7-be3a-32b65e546737", + "metadata": {}, + "outputs": [], + "source": [ + "demo.launch()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1e26a0bc-6a78-4185-8b0c-7e9450ba5868", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# demo.close()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/openvino/question_answering_quantization.ipynb b/notebooks/openvino/question_answering_quantization.ipynb index 782a48ff45..ba4a84ca38 100644 --- a/notebooks/openvino/question_answering_quantization.ipynb +++ b/notebooks/openvino/question_answering_quantization.ipynb @@ -846,9 +846,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "FP32 model size: 436.07 MB\n", - "INT8 model size: 182.41 MB\n", - "INT8 size decrease: 2.39x\n" + "FP32 model size: 436.50 MB\n", + "INT8 model size: 181.84 MB\n", + "INT8 size decrease: 2.4x\n" ] } ], @@ -858,7 +858,7 @@ " Return OpenVINO or PyTorch model size in Mb.\n", " Arguments:\n", " model_folder:\n", - " Directory containing a pytorch_model.bin for a PyTorch model, and an openvino_model.xml/.bin for an OpenVINO model.\n", + " Directory containing a model.safetensors for a PyTorch model, and an openvino_model.xml/.bin 
for an OpenVINO model.\n", " framework:\n", " Define whether the model is a PyTorch or an OpenVINO model.\n", " \"\"\"\n", @@ -866,7 +866,7 @@ " model_path = Path(model_folder) / \"openvino_model.xml\"\n", " model_size = model_path.stat().st_size + model_path.with_suffix(\".bin\").stat().st_size\n", " elif framework.lower() == \"pytorch\":\n", - " model_path = Path(model_folder) / \"pytorch_model.bin\"\n", + " model_path = Path(model_folder) / \"model.safetensors\"\n", " model_size = model_path.stat().st_size\n", " model_size /= 1000 * 1000\n", " return model_size\n", diff --git a/optimum/exporters/openvino/__init__.py b/optimum/exporters/openvino/__init__.py index 9664f6ae6d..94ea4f103b 100644 --- a/optimum/exporters/openvino/__init__.py +++ b/optimum/exporters/openvino/__init__.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import optimum.exporters.openvino.model_configs + from .__main__ import main_export from .convert import export, export_from_model, export_models, export_pytorch_via_onnx from .stateful import ensure_stateful_is_available, patch_stateful diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 1c695e2f19..02268a3604 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -58,7 +58,7 @@ def main_export( local_files_only: bool = False, use_auth_token: Optional[Union[bool, str]] = None, model_kwargs: Optional[Dict[str, Any]] = None, - custom_onnx_configs: Optional[Dict[str, "OnnxConfig"]] = None, + custom_export_configs: Optional[Dict[str, "OnnxConfig"]] = None, fn_get_submodels: Optional[Callable] = None, compression_option: Optional[str] = None, compression_ratio: Optional[float] = None, @@ -112,11 +112,11 @@ def main_export( when running `transformers-cli login` (stored in `~/.huggingface`). 
model_kwargs (`Optional[Dict[str, Any]]`, defaults to `None`): Experimental usage: keyword arguments to pass to the model during - the export. This argument should be used along the `custom_onnx_configs` argument + the export. This argument should be used along with the `custom_export_configs` argument in case, for example, the model inputs/outputs are changed (for example, if `model_kwargs={"output_attentions": True}` is passed). - custom_onnx_configs (`Optional[Dict[str, OnnxConfig]]`, defaults to `None`): - Experimental usage: override the default ONNX config used for the given model. This argument may be useful for advanced users that desire a finer-grained control on the export. An example is available [here](https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/export_a_model). + custom_export_configs (`Optional[Dict[str, OnnxConfig]]`, defaults to `None`): + Experimental usage: override the default export config used for the given model. This argument may be useful for advanced users that desire a finer-grained control over the export. An example is available [here](https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/export_a_model). fn_get_submodels (`Optional[Callable]`, defaults to `None`): Experimental usage: Override the default submodels that are used at the export. This is especially useful when exporting a custom architecture that needs to split the ONNX (e.g. encoder-decoder). If unspecified with custom models, optimum will try to use the default submodels used for the given task, with no guarantee of success.
@@ -134,7 +134,7 @@ def main_export( ```python >>> from optimum.exporters.openvino import main_export - >>> main_export("gpt2", output="gpt2_onnx/") + >>> main_export("gpt2", output="gpt2_ov/") ``` """ @@ -206,14 +206,14 @@ def main_export( if model_type not in TasksManager._SUPPORTED_MODEL_TYPE: custom_architecture = True elif task not in TasksManager.get_supported_tasks_for_model_type( - model_type, exporter="onnx", library_name=library_name + model_type, exporter="openvino", library_name=library_name ): if original_task == "auto": autodetected_message = " (auto-detected)" else: autodetected_message = "" model_tasks = TasksManager.get_supported_tasks_for_model_type( - model_type, exporter="onnx", library_name=library_name + model_type, exporter="openvino", library_name=library_name ) raise ValueError( f"Asked to export a {model_type} model for the task {task}{autodetected_message}, but the Optimum OpenVINO exporter only supports the tasks {', '.join(model_tasks.keys())} for {model_type}. Please use a supported task. Please open an issue at https://github.com/huggingface/optimum/issues if you would like the task {task} to be supported in the ONNX export for {model_type}." 
@@ -288,7 +288,7 @@ class StoreAttr(object): not custom_architecture and library_name != "diffusers" and task + "-with-past" - in TasksManager.get_supported_tasks_for_model_type(model_type, exporter="onnx", library_name=library_name) + in TasksManager.get_supported_tasks_for_model_type(model_type, exporter="openvino", library_name=library_name) ): # Make -with-past the default if --task was not explicitely specified if original_task == "auto": @@ -319,7 +319,7 @@ class StoreAttr(object): ov_config=ov_config, stateful=stateful, model_kwargs=model_kwargs, - custom_onnx_configs=custom_onnx_configs, + custom_export_configs=custom_export_configs, fn_get_submodels=fn_get_submodels, preprocessors=preprocessors, device=device, diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index 5353912d48..8c49994874 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -32,10 +32,11 @@ from optimum.exporters.onnx.convert import check_dummy_inputs_are_allowed from optimum.exporters.onnx.convert import export_pytorch as export_pytorch_to_onnx from optimum.exporters.onnx.convert import export_tensorflow as export_tensorflow_onnx +from optimum.exporters.utils import _get_submodels_and_export_configs from optimum.utils import DEFAULT_DUMMY_SHAPES, is_diffusers_available from optimum.utils.save_utils import maybe_save_preprocessors -from ...intel.utils.import_utils import is_nncf_available, is_optimum_version +from ...intel.utils.import_utils import is_nncf_available from .model_patcher import patch_model_with_bettertransformer from .stateful import ensure_export_task_support_stateful, ensure_stateful_is_available, patch_stateful from .utils import ( @@ -48,13 +49,6 @@ ) -if is_optimum_version(">=", "1.16.99"): - from optimum.exporters.onnx.utils import _get_submodels_and_onnx_configs - -else: - from optimum.exporters.onnx.__main__ import _get_submodels_and_onnx_configs - - UNSUPPORTED_TOKENIZER_CLASSES = 
(T5Tokenizer, T5TokenizerFast) @@ -299,7 +293,7 @@ def export_pytorch( logger.info(f"Using framework PyTorch: {torch.__version__}") output = Path(output) - if stateful: + if ensure_export_task_support_stateful(config.task): # Trigger bettertransformer together with stateful model because OpenVINO HW-dependent transformations expect # both of them are applied to demonstrate the best performance. # TODO: Consider applying bettertransformer regardless of stateful flag -- requires additional validation. @@ -418,7 +412,7 @@ def ts_patched_forward(*args, **kwargs): def export_models( - models_and_onnx_configs: Dict[ + models_and_export_configs: Dict[ str, Tuple[Union["PreTrainedModel", "TFPreTrainedModel", "ModelMixin"], "OnnxConfig"] ], output_dir: Path, @@ -434,7 +428,7 @@ def export_models( Export the models to OpenVINO IR format Args: - models_and_onnx_configs (Dict[ str, Tuple[Union["PreTrainedModel", "TFPreTrainedModel", "ModelMixin"], "OnnxConfig"]): + models_and_export_configs (Dict[ str, Tuple[Union["PreTrainedModel", "TFPreTrainedModel", "ModelMixin"], "OnnxConfig"]): output_dir (Path): output directory for saving models opset (Optional[int], optional, Default to None): ONNX export opset output_names (Optional[List[str]], optional, Defaults to None): model output names @@ -459,20 +453,20 @@ def export_models( outputs = [] - if output_names is not None and len(output_names) != len(models_and_onnx_configs): + if output_names is not None and len(output_names) != len(models_and_export_configs): raise ValueError( - f"Provided custom names {output_names} for the export of {len(models_and_onnx_configs)} models. Please provide the same number of names as models to export." + f"Provided custom names {output_names} for the export of {len(models_and_export_configs)} models. Please provide the same number of names as models to export." 
) - for i, model_name in enumerate(models_and_onnx_configs.keys()): - submodel, sub_onnx_config = models_and_onnx_configs[model_name] + for i, model_name in enumerate(models_and_export_configs.keys()): + submodel, sub_export_config = models_and_export_configs[model_name] output_name = output_names[i] if output_names is not None else Path(model_name + ".xml") output_path = output_dir / output_name output_path.parent.mkdir(parents=True, exist_ok=True) outputs.append( export( model=submodel, - config=sub_onnx_config, + config=sub_export_config, output=output_path, opset=opset, device=device, @@ -495,7 +489,7 @@ def export_from_model( stateful: bool = True, opset: Optional[int] = None, model_kwargs: Optional[Dict[str, Any]] = None, - custom_onnx_configs: Optional[Dict[str, "OnnxConfig"]] = None, + custom_export_configs: Optional[Dict[str, "OnnxConfig"]] = None, fn_get_submodels: Optional[Callable] = None, preprocessors: List = None, device: str = "cpu", @@ -524,14 +518,14 @@ def export_from_model( task = TasksManager._infer_task_from_model_or_model_class(model=model) except (ValueError, KeyError) as e: raise RuntimeError( - f"The model task could not be automatically inferred in `onnx_export_from_model`. Please provide the argument `task` with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}" + f"The model task could not be automatically inferred in `export_from_model`. Please provide the argument `task` with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}" ) if ( not custom_architecture and library_name != "diffusers" and task + "-with-past" - in TasksManager.get_supported_tasks_for_model_type(model_type, "onnx", library_name=library_name) + in TasksManager.get_supported_tasks_for_model_type(model_type, "openvino", library_name=library_name) ): # -with-past is the default. 
task = task + "-with-past" @@ -541,9 +535,9 @@ def export_from_model( stateful = stateful and ensure_export_task_support_stateful(task) # TODO: support onnx_config.py in the model repo - if custom_architecture and custom_onnx_configs is None: + if custom_architecture and custom_export_configs is None: raise ValueError( - f"Trying to export a {model_type} model, that is a custom or unsupported architecture, but no custom onnx configuration was passed as `custom_onnx_configs`. Please refer to https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/export_a_model#custom-export-of-transformers-models for an example on how to export custom models. Please open an issue at https://github.com/huggingface/optimum/issues if you would like the model type {model_type} to be supported natively in the ONNX export." + f"Trying to export a {model_type} model, that is a custom or unsupported architecture, but no custom export configuration was passed as `custom_export_configs`. Please refer to https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/export_a_model#custom-export-of-transformers-models for an example on how to export custom models. Please open an issue at https://github.com/huggingface/optimum/issues if you would like the model type {model_type} to be supported natively in the ONNX export." 
) if task.startswith("text-generation") and model.config.is_encoder_decoder: @@ -569,11 +563,11 @@ def export_from_model( kwargs_shapes[input_name] if input_name in kwargs_shapes else DEFAULT_DUMMY_SHAPES[input_name] ) - onnx_config, models_and_onnx_configs = _get_submodels_and_onnx_configs( + export_config, models_and_export_configs = _get_submodels_and_export_configs( model=model, task=task, monolith=False, - custom_onnx_configs=custom_onnx_configs if custom_onnx_configs is not None else {}, + custom_export_configs=custom_export_configs if custom_export_configs is not None else {}, custom_architecture=custom_architecture, fn_get_submodels=fn_get_submodels, preprocessors=preprocessors, @@ -581,6 +575,7 @@ def export_from_model( model_kwargs=model_kwargs, _variant="default", legacy=False, + exporter="openvino", ) if ov_config is None: @@ -612,18 +607,18 @@ def export_from_model( model_name_or_path = model.config._name_or_path maybe_save_preprocessors(model_name_or_path, output) - files_subpaths = ["openvino_" + model_name + ".xml" for model_name in models_and_onnx_configs.keys()] + files_subpaths = ["openvino_" + model_name + ".xml" for model_name in models_and_export_configs.keys()] else: # save the subcomponent configuration - for model_name in models_and_onnx_configs: - subcomponent = models_and_onnx_configs[model_name][0] + for model_name in models_and_export_configs: + subcomponent = models_and_export_configs[model_name][0] if hasattr(subcomponent, "save_config"): subcomponent.save_config(output / model_name) elif hasattr(subcomponent, "config") and hasattr(subcomponent.config, "save_pretrained"): subcomponent.config.save_pretrained(output / model_name) - files_subpaths = [os.path.join(name_dir, OV_XML_FILE_NAME) for name_dir in models_and_onnx_configs] + files_subpaths = [os.path.join(name_dir, OV_XML_FILE_NAME) for name_dir in models_and_export_configs] # Saving the additional components needed to perform inference. 
model.scheduler.save_pretrained(output.joinpath("scheduler")) @@ -643,7 +638,7 @@ def export_from_model( model.save_config(output) export_models( - models_and_onnx_configs=models_and_onnx_configs, + models_and_export_configs=models_and_export_configs, output_dir=output, output_names=files_subpaths, input_shapes=input_shapes, diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py new file mode 100644 index 0000000000..b6536512b1 --- /dev/null +++ b/optimum/exporters/openvino/model_configs.py @@ -0,0 +1,391 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from copy import deepcopy +from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union + +from packaging import version +from transformers.utils import is_tf_available + +from optimum.exporters.onnx.config import TextDecoderOnnxConfig, TextDecoderWithPositionIdsOnnxConfig +from optimum.exporters.onnx.model_configs import GemmaOnnxConfig +from optimum.exporters.tasks import TasksManager +from optimum.utils import DEFAULT_DUMMY_SHAPES +from optimum.utils.input_generators import ( + DummyInputGenerator, + DummyPastKeyValuesGenerator, + DummyTextInputGenerator, + MistralDummyPastKeyValuesGenerator, +) +from optimum.utils.normalized_config import NormalizedTextConfig + +from .model_patcher import ( + BaichuanModelPatcher, + ChatGLMModelPatcher, + GemmaModelPatcher, + MixtralModelPatcher, + QwenModelPatcher, +) + + +def init_model_configs(): + supported_model_types = [ + "_SUPPORTED_MODEL_TYPE", + "_DIFFUSERS_SUPPORTED_MODEL_TYPE", + "_TIMM_SUPPORTED_MODEL_TYPE", + "_SENTENCE_TRANSFORMERS_SUPPORTED_MODEL_TYPE", + ] + + for supported_models_config in supported_model_types: + supported_models = getattr(TasksManager, supported_models_config) + for model, export_configs in supported_models.items(): + if "onnx" not in export_configs: + continue + onnx_config = export_configs["onnx"] + supported_models[model]["openvino"] = deepcopy(onnx_config) + + setattr(TasksManager, supported_models_config, supported_models) + + +init_model_configs() + + +if TYPE_CHECKING: + from transformers.modeling_utils import PreTrainedModel + + from optimum.exporters.onnx.model_patcher import ModelPatcher + + if is_tf_available(): + from transformers.modeling_tf_utils import TFPreTrainedModel + + +register_in_tasks_manager = TasksManager.create_register("openvino", overwrite_existing=True) + + +@register_in_tasks_manager("baichuan", *["text-generation", "text-generation-with-past"], library_name="transformers") +class BaichaunOpenVINOConfig(TextDecoderOnnxConfig): + DEFAULT_ONNX_OPSET = 13 
+ NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args( + num_layers="num_hidden_layers", num_attention_heads="num_attention_heads", hidden_size="hidden_size" + ) + + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return BaichuanModelPatcher(self, model, model_kwargs=model_kwargs) + + +@register_in_tasks_manager("qwen2", *["text-generation", "text-generation-with-past"], library_name="transformers") +class Qwen2OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): + DEFAULT_ONNX_OPSET = 14 + + DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, MistralDummyPastKeyValuesGenerator) + DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + + +@register_in_tasks_manager("minicpm", *["text-generation", "text-generation-with-past"], library_name="transformers") +class MiniCPMOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): + DEFAULT_ONNX_OPSET = 14 + + DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, MistralDummyPastKeyValuesGenerator) + DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + + +@register_in_tasks_manager("stablelm", *["text-generation", "text-generation-with-past"], library_name="transformers") +class StableLMOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): + DEFAULT_ONNX_OPSET = 14 + + DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, MistralDummyPastKeyValuesGenerator) + DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + + +class ChatGLM2DummyPastKeyValuesGenerator(DummyPastKeyValuesGenerator): + def __init__( + self, + task: str, + normalized_config: NormalizedTextConfig, + batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], + sequence_length: int = DEFAULT_DUMMY_SHAPES["sequence_length"], + random_batch_size_range: 
Optional[Tuple[int, int]] = None, + random_sequence_length_range: Optional[Tuple[int, int]] = None, + **kwargs, + ): + super().__init__( + task=task, + normalized_config=normalized_config, + batch_size=batch_size, + sequence_length=sequence_length, + random_batch_size_range=random_batch_size_range, + random_sequence_length_range=random_sequence_length_range, + ) + self.multi_query_group_num = normalized_config.multi_query_group_num + self.head_dim = normalized_config.kv_channels + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + past_key_shape = ( + self.sequence_length, + self.batch_size, + self.multi_query_group_num, + self.head_dim, + ) + past_value_shape = ( + self.sequence_length, + self.batch_size, + self.multi_query_group_num, + self.head_dim, + ) + return [ + ( + self.random_float_tensor(past_key_shape, framework=framework, dtype=float_dtype), + self.random_float_tensor(past_value_shape, framework=framework, dtype=float_dtype), + ) + for _ in range(self.num_layers) + ] + + +@register_in_tasks_manager("chatglm", *["text-generation", "text-generation-with-past"], library_name="transformers") +class ChatGLM2OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args(vocab_size="padded_vocab_size", num_layers="num_layers") + DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, ChatGLM2DummyPastKeyValuesGenerator) + DUMMY_PKV_GENERATOR_CLASS = ChatGLM2DummyPastKeyValuesGenerator + + def generate_dummy_inputs(self, framework: str = "pt", **kwargs): + dummy_inputs_generators = self._create_dummy_input_generator_classes(**kwargs) + + dummy_inputs = {} + input_names = [key for key in self.inputs.keys() if not key.startswith("past_key_values")] + if self.use_past_in_inputs and self.use_cache_branch is not False: + input_names.append("past_key_values") + + for input_name in input_names: + input_was_inserted = False + for dummy_input_gen in 
dummy_inputs_generators: + if dummy_input_gen.supports_input(input_name): + dummy_inputs[input_name] = self.overwrite_shape_and_generate_input( + dummy_input_gen, + input_name, + framework, + input_shapes=kwargs, + ) + input_was_inserted = True + break + if not input_was_inserted: + raise RuntimeError( + f'Could not generate dummy input for "{input_name}". Try adding a proper dummy input generator to the model ONNX config.' + ) + + # refer to https://github.com/huggingface/optimum/pull/764 + if ( + self.use_past_in_inputs + and self.PAD_ATTENTION_MASK_TO_PAST + and self.use_cache_branch is not False + and "attention_mask" in dummy_inputs + ): + # Obtain the past sequence length from the value instead of the key (Bloom). ChatGLM has seq_len in 0 dim instead of -2 + past_present_length = dummy_inputs["input_ids"].shape[1] + dummy_inputs["past_key_values"][0][1].shape[0] + + dummy_inputs["attention_mask"] = DummyInputGenerator.pad_input_on_dim( + dummy_inputs["attention_mask"], + desired_length=past_present_length, + dim=1, + dtype=dummy_inputs["attention_mask"].dtype, + ) + + return dummy_inputs + + def add_past_key_values(self, inputs_or_outputs: Dict[str, Dict[int, str]], direction: str): + """ + Fills `input_or_outputs` mapping with past_key_values dynamic axes considering the direction. + + Args: + inputs_or_outputs (`Dict[str, Dict[int, str]]`): The mapping to fill. + direction (`str`): + either "inputs" or "outputs", it specifies whether `input_or_outputs` is the input mapping or the + output mapping, this is important for axes naming. 
+ """ + if direction not in ["inputs", "outputs"]: + raise ValueError(f'direction must either be "inputs" or "outputs", but {direction} was given') + + if direction == "inputs": + decoder_sequence_name = "past_sequence_length" + name = "past_key_values" + else: + decoder_sequence_name = "past_sequence_length + present_lenght" + name = "present" + + for i in range(self._normalized_config.num_layers): + inputs_or_outputs[f"{name}.{i}.key"] = {1: "batch_size", 0: decoder_sequence_name} + inputs_or_outputs[f"{name}.{i}.value"] = {1: "batch_size", 0: decoder_sequence_name} + + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return ChatGLMModelPatcher(self, model, model_kwargs=model_kwargs) + + +@register_in_tasks_manager("mixtral", *["text-generation", "text-generation-with-past"], library_name="transformers") +class MixtralOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): + # This is because of the patching of torch.triu in AttentionMaskConverter, that exists from transformers>=4.35 + MIN_TRANSFORMERS_VERSION = version.parse("4.34.99") + + # The ONNX export of this architecture needs the Trilu operator support, available since opset 14 + DEFAULT_ONNX_OPSET = 14 + DUMMY_INPUT_GENERATOR_CLASSES = ( + MistralDummyPastKeyValuesGenerator, + ) + TextDecoderOnnxConfig.DUMMY_INPUT_GENERATOR_CLASSES + DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args(num_key_value_heads="num_key_value_heads", allow_new=True) + + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return MixtralModelPatcher(self, model, model_kwargs=model_kwargs) + + +@register_in_tasks_manager( + "gemma", + *[ + "feature-extraction", + "feature-extraction-with-past", + "text-generation", + "text-generation-with-past", 
+ "text-classification", + ], + library_name="transformers", +) +class GemmaOpenVINOConfig(GemmaOnnxConfig): + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return GemmaModelPatcher(self, model, model_kwargs=model_kwargs) + + +class QwenDummyPastKeyValuesGenerator(DummyPastKeyValuesGenerator): + def __init__( + self, + task: str, + normalized_config: NormalizedTextConfig, + batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], + sequence_length: int = DEFAULT_DUMMY_SHAPES["sequence_length"], + random_batch_size_range: Optional[Tuple[int, int]] = None, + random_sequence_length_range: Optional[Tuple[int, int]] = None, + **kwargs, + ): + super().__init__( + task=task, + normalized_config=normalized_config, + batch_size=batch_size, + sequence_length=sequence_length, + random_batch_size_range=random_batch_size_range, + random_sequence_length_range=random_sequence_length_range, + ) + self.kv_channels = normalized_config.kv_channels + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + past_key_shape = (self.batch_size, self.sequence_length, self.num_attention_heads, self.kv_channels) + past_value_shape = (self.batch_size, self.sequence_length, self.num_attention_heads, self.kv_channels) + return [ + ( + self.random_float_tensor(past_key_shape, framework=framework, dtype=float_dtype), + self.random_float_tensor(past_value_shape, framework=framework, dtype=float_dtype), + ) + for _ in range(self.num_layers) + ] + + +@register_in_tasks_manager("qwen", *["text-generation", "text-generation-with-past"]) +class QwenOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): + DEFAULT_ONNX_OPSET = 14 + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args( + num_layers="num_hidden_layers", num_attention_heads="num_attention_heads", hidden_size="hidden_size" + ) + DUMMY_INPUT_GENERATOR_CLASSES = 
(DummyTextInputGenerator, QwenDummyPastKeyValuesGenerator) + DUMMY_PKV_GENERATOR_CLASS = QwenDummyPastKeyValuesGenerator + no_position_ids = False + + def generate_dummy_inputs(self, framework: str = "pt", **kwargs): + dummy_inputs_generators = self._create_dummy_input_generator_classes(**kwargs) + + dummy_inputs = {} + input_names = [key for key in self.inputs.keys() if not key.startswith("past_key_values")] + if self.use_past_in_inputs and self.use_cache_branch is not False: + input_names.append("past_key_values") + + for input_name in input_names: + input_was_inserted = False + for dummy_input_gen in dummy_inputs_generators: + if dummy_input_gen.supports_input(input_name): + dummy_inputs[input_name] = self.overwrite_shape_and_generate_input( + dummy_input_gen, + input_name, + framework, + input_shapes=kwargs, + ) + input_was_inserted = True + break + if not input_was_inserted: + raise RuntimeError( + f'Could not generate dummy input for "{input_name}". Try adding a proper dummy input generator to the model ONNX config.' + ) + + # refer to https://github.com/huggingface/optimum/pull/764 + if ( + self.use_past_in_inputs + and self.PAD_ATTENTION_MASK_TO_PAST + and self.use_cache_branch is not False + and "attention_mask" in dummy_inputs + ): + # Obtain the past sequence length from the value instead of the key (Bloom). Qwen has seq_len in 1 dim instead of -2 + past_present_length = dummy_inputs["input_ids"].shape[1] + dummy_inputs["past_key_values"][0][1].shape[1] + + dummy_inputs["attention_mask"] = DummyInputGenerator.pad_input_on_dim( + dummy_inputs["attention_mask"], + desired_length=past_present_length, + dim=1, + dtype=dummy_inputs["attention_mask"].dtype, + ) + + return dummy_inputs + + def add_past_key_values(self, inputs_or_outputs: Dict[str, Dict[int, str]], direction: str): + """ + Fills `input_or_outputs` mapping with past_key_values dynamic axes considering the direction. + + Args: + inputs_or_outputs (`Dict[str, Dict[int, str]]`): The mapping to fill. 
+ direction (`str`): + either "inputs" or "outputs", it specifies whether `input_or_outputs` is the input mapping or the + output mapping, this is important for axes naming. + """ + if direction not in ["inputs", "outputs"]: + raise ValueError(f'direction must either be "inputs" or "outputs", but {direction} was given') + + if direction == "inputs": + decoder_sequence_name = "past_sequence_length" + name = "past_key_values" + else: + decoder_sequence_name = "past_sequence_length + 1" + name = "present" + + for i in range(self._normalized_config.num_layers): + inputs_or_outputs[f"{name}.{i}.key"] = {0: "batch_size", 1: decoder_sequence_name} + inputs_or_outputs[f"{name}.{i}.value"] = {0: "batch_size", 1: decoder_sequence_name} + + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return QwenModelPatcher(self, model, model_kwargs=model_kwargs) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 91dc48df05..a31287d84f 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -1,4 +1,4 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. +# Copyright 2024 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,7 +13,15 @@ # limitations under the License. 
import logging as log +import types +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union +import torch +import torch.nn.functional as F +from transformers.modeling_outputs import BaseModelOutputWithPast +from transformers.utils import is_tf_available + +from optimum.exporters.onnx.model_patcher import DecoderModelPatcher from optimum.intel.utils.import_utils import ( _openvino_version, _torch_version, @@ -24,6 +32,15 @@ ) +if TYPE_CHECKING: + from transformers.modeling_utils import PreTrainedModel + + from optimum.exporters.onnx.config import OnnxConfig + + if is_tf_available(): + from transformers.modeling_tf_utils import TFPreTrainedModel + + def patch_model_with_bettertransformer(model): COLOR_RED = "\033[1;31m" COLOR_RESET = "\033[0m" @@ -43,7 +60,7 @@ def patch_model_with_bettertransformer(model): ) if ( - getattr(model.config, "model_type") in {"gpt_bigcode", "llama"} + getattr(model.config, "model_type") in {"gpt_bigcode", "llama", "gemma"} and is_transformers_version(">=", "4.38") and is_openvino_version("<", "2024.1.0-14612") ): @@ -52,10 +69,11 @@ def patch_model_with_bettertransformer(model): _openvino_version.split("-")[0] if is_openvino_version("<=", "2024.0.0-14509") else _openvino_version ) log.warn( - COLOR_RED + f"[WARNING] Stateful models are not supported for Llama and GPTBigCode with Transformers " + COLOR_RED + + f"[WARNING] Stateful models are not supported for Llama, Gemma and GPTBigCode with Transformers " f"{_transformers_version} and OpenVINO {display_version}. For good performance, consider using a nightly OpenVINO build: " - "https://docs.openvino.ai/2024/get-started/install-openvino.html. For models that do not need transformers " - "4.38+, it is also an option to downgrade transformers: `pip install transformers==4.37.2`" + COLOR_RESET + "https://docs.openvino.ai/2024/get-started/install-openvino.html. 
For gpt-bigcode and llama models, " + "it is also an option to downgrade transformers: `pip install transformers==4.37.2`" + COLOR_RESET ) # model already has required SDPA implementation @@ -71,3 +89,425 @@ def patch_model_with_bettertransformer(model): return model return model + + +def _mixtral_sparse_moe_block_forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + """ """ + batch_size, sequence_length, hidden_dim = hidden_states.shape + hidden_states = hidden_states.view(-1, hidden_dim) + # router_logits: (batch * sequence_length, n_experts) + router_logits = self.gate(hidden_states) + + routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float) + routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1) + routing_weights /= routing_weights.sum(dim=-1, keepdim=True) + # we cast back to the input dtype + routing_weights = routing_weights.to(hidden_states.dtype) + + final_hidden_states = torch.zeros( + (batch_size * sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device + ) + + # One hot encode the selected experts to create an expert mask + # this will be used to easily index which expert is going to be sollicitated + expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0) + + # Loop over all available experts in the model and perform the computation on each expert + for expert_idx in range(self.num_experts): + expert_layer = self.experts[expert_idx] + idx, top_x = torch.where(expert_mask[expert_idx]) + + # Index the correct hidden states and compute the expert hidden state for + # the current expert. 
We need to make sure to multiply the output hidden + # states by `routing_weights` on the corresponding tokens (top-1 and top-2) + current_state = hidden_states[None, top_x].reshape(-1, hidden_dim) + current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None] + + final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype)) + final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim) + return final_hidden_states, router_logits + + +class MixtralModelPatcher(DecoderModelPatcher): + def __enter__(self): + super().__enter__() + for layer in self._model.model.layers: + layer.block_sparse_moe._unpatched_forward = layer.block_sparse_moe.forward + layer.block_sparse_moe.forward = types.MethodType( + _mixtral_sparse_moe_block_forward, layer.block_sparse_moe + ) + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + for layer in self._model.model.layers: + layer.block_sparse_moe.forward = layer.block_sparse_moe._unpatched_forward + + +def _chatglm_transformer_forward( + self, + input_ids, + position_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.BoolTensor] = None, + full_attention_mask: Optional[torch.BoolTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, + inputs_embeds: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, +): + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size, seq_length = input_ids.shape + + if inputs_embeds is None: + inputs_embeds = self.embedding(input_ids) + + if self.pre_seq_len is not None: + 
if past_key_values is None: + past_key_values = self.get_prompt( + batch_size=batch_size, + device=input_ids.device, + dtype=inputs_embeds.dtype, + ) + if attention_mask is not None: + attention_mask = torch.cat( + [ + attention_mask.new_ones((batch_size, self.pre_seq_len)), + attention_mask, + ], + dim=-1, + ) + + if full_attention_mask is None: + if past_key_values is not None: + full_attention_mask = torch.ones( + batch_size, + seq_length, + seq_length, + device=input_ids.device, + dtype=torch.float, + ) * float("-inf") + full_attention_mask.triu_(diagonal=1) + past_length = 0 + if past_key_values: + past_length = past_key_values[0][0].shape[0] + if past_length: + full_attention_mask = torch.cat( + ( + torch.zeros(batch_size, seq_length, past_length, device=input_ids.device), + full_attention_mask, + ), + dim=-1, + ) + full_attention_mask.unsqueeze_(1) + + # Rotary positional embeddings + rotary_pos_emb = self.rotary_pos_emb(self.seq_length) + if position_ids is not None: + rotary_pos_emb = rotary_pos_emb[position_ids] + else: + rotary_pos_emb = rotary_pos_emb[None, :seq_length] + rotary_pos_emb = rotary_pos_emb.transpose(0, 1).contiguous() + + # Run encoder. 
+ hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder( + inputs_embeds, + full_attention_mask, + rotary_pos_emb=rotary_pos_emb, + kv_caches=past_key_values, + use_cache=use_cache, + output_hidden_states=output_hidden_states, + ) + + if not return_dict: + return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=presents, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +def _chatglm2_get_context_layer(query_layer: torch.Tensor, key_layer: torch.Tensor, value_layer: torch.Tensor): + mask = torch.zeros((query_layer.shape[-2], key_layer.shape[-2]), dtype=query_layer.dtype) + if query_layer.shape[2] == key_layer.shape[2]: + tmp_mask = torch.ones((query_layer.shape[-2], key_layer.shape[-2]), dtype=torch.bool).triu(diagonal=1) + mask.masked_fill_(tmp_mask, float("-inf")) + + context_layer = torch.nn.functional.scaled_dot_product_attention( + query_layer, key_layer, value_layer, attn_mask=mask + ) + return context_layer + + +def _chatglm2_core_attention_forward(self, query_layer, key_layer, value_layer, attention_mask): + query_layer, key_layer, value_layer = [k.permute(1, 2, 0, 3) for k in [query_layer, key_layer, value_layer]] + if attention_mask is None: + context_layer = _chatglm2_get_context_layer(query_layer, key_layer, value_layer) + else: + context_layer = torch.nn.functional.scaled_dot_product_attention( + query_layer, key_layer, value_layer, attention_mask + ) + context_layer = context_layer.permute(2, 0, 1, 3) + new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,) + context_layer = context_layer.reshape(*new_context_layer_shape) + + return context_layer + + +class ChatGLMModelPatcher(DecoderModelPatcher): + def __init__( + self, + config: "OnnxConfig", + model: Union["PreTrainedModel", "TFPreTrainedModel"], + model_kwargs: Dict[str, 
Any], + ): + super().__init__(config, model, model_kwargs) + + self.original_chatglm_transformer_forward = model.transformer.forward + + def __enter__(self): + super().__enter__() + self._model.transformer.forward = types.MethodType(_chatglm_transformer_forward, self._model.transformer) + for block in self._model.transformer.encoder.layers: + block.self_attention.core_attention._orig_forward = block.self_attention.core_attention.forward + block.self_attention.core_attention.forward = types.MethodType( + _chatglm2_core_attention_forward, block.self_attention.core_attention + ) + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + self._model.transformer.forward = self.original_chatglm_transformer_forward + for block in self._model.transformer.encoder.layers: + block.self_attention.core_attention.forward = block.self_attention.core_attention._orig_forward + + +class GemmaModelPatcher(DecoderModelPatcher): + def __enter__(self): + super().__enter__() + + # init inv_freq for torchscript tracing + # https://github.com/huggingface/transformers/blob/ed74d97871468f3a4695ede50abdc0b55717a84d/src/transformers/models/gemma/modeling_gemma.py#L108 + for layer in self._model.model.layers: + if layer.self_attn.rotary_emb.inv_freq is None: + rotary_emb = layer.self_attn.rotary_emb + layer.self_attn.rotary_emb.inv_freq = 1.0 / ( + rotary_emb.base ** (torch.arange(0, rotary_emb.dim, 2, dtype=torch.int64).float() / rotary_emb.dim) + ) + + +SUPPORT_SDPA = is_torch_version(">", "2.1.0") + + +def _qwen_rotate_half(x): + from einops import rearrange + + x = rearrange(x, "... (j d) -> ... 
j d", j=2) + x1, x2 = x.unbind(dim=-2) + return torch.cat((-x2, x1), dim=-1) + + +def _qwen_apply_rotary_pos_emb(t, freqs): + cos, sin = freqs + rot_dim = freqs[0].shape[-1] + cos, sin = freqs + t_, t_pass_ = t[..., :rot_dim], t[..., rot_dim:] + t_ = t_.float() + t_pass_ = t_pass_.float() + t_ = (t_ * cos) + (_qwen_rotate_half(t_) * sin) + return torch.cat((t_, t_pass_), dim=-1).type_as(t) + + +def _qwen_quantize_cache_v(fdata, bits, qmax, qmin): + # b, s, head, h-dim->b, head, s, h-dim + qtype = torch.uint8 + device = fdata.device + shape = fdata.shape + + fdata_cal = torch.flatten(fdata, 2) + fmax = torch.amax(fdata_cal, dim=-1, keepdim=True) + fmin = torch.amin(fdata_cal, dim=-1, keepdim=True) + # Compute params + if qmax.device != fmax.device: + qmax = qmax.to(device) + qmin = qmin.to(device) + scale = (fmax - fmin) / (qmax - qmin) + zero = qmin - fmin / scale + scale = scale.unsqueeze(-1).repeat(1, 1, shape[2], 1).contiguous() + zero = zero.unsqueeze(-1).repeat(1, 1, shape[2], 1).contiguous() + # Quantize + res_data = fdata / scale + zero + qdata = torch.clamp(res_data, qmin, qmax).to(qtype) + return qdata.contiguous(), scale, zero + + +def _qwen_attention_forward( + self, + hidden_states: Optional[Tuple[torch.FloatTensor]], + rotary_pos_emb_list: Optional[List[List[torch.Tensor]]] = None, + layer_past: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, +): + mixed_x_layer = self.c_attn(hidden_states) + + query, key, value = mixed_x_layer.split(self.split_size, dim=2) + + query = self._split_heads(query, self.num_heads, self.head_dim) + key = self._split_heads(key, self.num_heads, self.head_dim) + value = self._split_heads(value, self.num_heads, self.head_dim) + + if 
rotary_pos_emb_list is not None: + cur_len = query.shape[1] + if len(rotary_pos_emb_list) == 1: + rotary_pos_emb = rotary_pos_emb_list[0] + rotary_pos_emb = [i[:, -cur_len:, :, :] for i in rotary_pos_emb] + rotary_pos_emb = (rotary_pos_emb,) * 2 + q_pos_emb, k_pos_emb = rotary_pos_emb + # Slice the pos emb for current inference + query = _qwen_apply_rotary_pos_emb(query, q_pos_emb) + key = _qwen_apply_rotary_pos_emb(key, k_pos_emb) + else: + query_list = [] + key_list = [] + for i, rotary_pos_emb in enumerate(rotary_pos_emb_list): + rotary_pos_emb = [i[:, -cur_len:, :, :] for i in rotary_pos_emb] + rotary_pos_emb = (rotary_pos_emb,) * 2 + q_pos_emb, k_pos_emb = rotary_pos_emb + # Slice the pos emb for current inference + query_list += [_qwen_apply_rotary_pos_emb(query[i : i + 1, :, :], q_pos_emb)] + key_list += [_qwen_apply_rotary_pos_emb(key[i : i + 1, :, :], k_pos_emb)] + query = torch.cat(query_list, dim=0) + key = torch.cat(key_list, dim=0) + + if self.use_cache_quantization: + key = _qwen_quantize_cache_v(key.permute(0, 2, 1, 3), bits=8, qmin=self.cache_qmin, qmax=self.cache_qmax) + value = _qwen_quantize_cache_v(value.permute(0, 2, 1, 3), bits=8, qmin=self.cache_qmin, qmax=self.cache_qmax) + + if layer_past is not None: + past_key, past_value = layer_past[0], layer_past[1] + if self.use_cache_quantization: + # use_cache_quantization: + # present=((q_key,key_scale,key_zero_point), + # (q_value,value_scale,value_zero_point)) + key = ( + torch.cat((past_key[0], key[0]), dim=2), + torch.cat((past_key[1], key[1]), dim=2), + torch.cat((past_key[2], key[2]), dim=2), + ) + value = ( + torch.cat((past_value[0], value[0]), dim=2), + torch.cat((past_value[1], value[1]), dim=2), + torch.cat((past_value[2], value[2]), dim=2), + ) + else: + # not use_cache_quantization: + # present=(key,value) + key = torch.cat((past_key, key), dim=1) + value = torch.cat((past_value, value), dim=1) + + if use_cache: + present = (key, value) + else: + present = None + + if 
self.use_logn_attn and not self.training: + if self.use_cache_quantization: + seq_start = key[0].size(2) - query.size(1) + seq_end = key[0].size(2) + else: + seq_start = key.size(1) - query.size(1) + seq_end = key.size(1) + logn_tensor = self.logn_tensor[:, seq_start:seq_end, :, :].type_as(query) + query = query * logn_tensor.expand_as(query) + + if self.use_flash_attn and not self.is_fp32 and query.is_cuda: + q, k, v = query, key, value + attn_output = self.core_attention_flash(q, k, v, attention_mask=attention_mask) + else: + registered_causal_mask = torch.tril( + torch.ones((key.size(1), key.size(1)), dtype=torch.bool, device=key.device) + ).view(1, 1, key.size(1), key.size(1)) + query = query.permute(0, 2, 1, 3) + if not self.use_cache_quantization: + key = key.permute(0, 2, 1, 3) + value = value.permute(0, 2, 1, 3) + + if not self.use_cache_quantization and SUPPORT_SDPA: + causal_mask = registered_causal_mask[:, :, key.size(-2) - query.size(-2) : key.size(-2), : key.size(-2)] + if attention_mask is not None: + attention_mask = attention_mask.expand(-1, -1, causal_mask.size(2), -1).masked_fill( + ~causal_mask, torch.finfo(query.dtype).min + ) + else: + attention_mask = causal_mask + attn_output = F.scaled_dot_product_attention(query, key, value, attn_mask=attention_mask).transpose(1, 2) + attn_weight = None + else: + attn_output, attn_weight = self._attn(query, key, value, registered_causal_mask, attention_mask, head_mask) + context_layer = self._merge_heads(attn_output, self.num_heads, self.head_dim) + + attn_output = self.c_proj(context_layer) + + outputs = (attn_output, present) + if output_attentions: + if self.use_flash_attn and not self.is_fp32: + raise ValueError("Cannot output attentions while using flash-attn") + else: + outputs += (attn_weight,) + + return outputs + + +class QwenModelPatcher(DecoderModelPatcher): + def __init__( + self, + config: "OnnxConfig", + model: Union["PreTrainedModel", "TFPreTrainedModel"], + model_kwargs: Dict[str, Any], + ): 
+ super().__init__(config, model, model_kwargs) + + self.original_fp16 = model.config.fp16 + self.original_bf16 = model.config.bf16 + model.config.bf16 = False + model.config.fp16 = False + if self.original_fp16 or self.original_bf16: + model.to(torch.float32) + model.transformer.rotary_emb(2048) + + def __enter__(self): + super().__enter__() + for block in self._model.transformer.h: + block.attn._orig_forward = block.attn.forward + block.attn.forward = types.MethodType(_qwen_attention_forward, block.attn) + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + for block in self._model.transformer.h: + block.attn.forward = block.attn._orig_forward + self._model.config.bf16 = self.original_bf16 + self._model.config.fp16 = self.original_fp16 + + +class BaichuanModelPatcher(DecoderModelPatcher): + def __init__( + self, + config: "OnnxConfig", + model: Union["PreTrainedModel", "TFPreTrainedModel"], + model_kwargs: Dict[str, Any], + ): + super().__init__(config, model, model_kwargs) + # model has first inference buffers initialization + if self._model.lm_head.first_flag: + self._model(torch.ones((1, 10), dtype=torch.int64), torch.ones((1, 10), dtype=torch.int64)) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 53aa05bc5a..832c132615 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -316,7 +316,9 @@ def _reshape( shapes[inputs][0] = -1 input_name = inputs.get_any_name() if input_name.startswith("past_key_values"): - if len(inputs.partial_shape) == 3 and input_name.endswith("value"): + if ( + len(inputs.partial_shape) == 3 and input_name.endswith("value") + ) or self.config.model_type == "chatglm": shapes[inputs][1] = -1 else: shapes[inputs][2] = -1 diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index c46f29092b..2022a495d8 100644 --- 
a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -350,7 +350,7 @@ def _quantize_torchmodel( model_type = self.model.config.model_type.replace("_", "-") onnx_config_class = TasksManager.get_exporter_config_constructor( - exporter="onnx", + exporter="openvino", model=self.model, task=self.task, model_type=model_type, diff --git a/setup.py b/setup.py index a59721450f..2a125597df 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,6 @@ +import os import re +import subprocess from setuptools import find_namespace_packages, setup @@ -8,13 +10,26 @@ filepath = "optimum/intel/version.py" with open(filepath) as version_file: (__version__,) = re.findall('__version__ = "(.*)"', version_file.read()) + if __version__.endswith(".dev0"): + dev_version_id = "" + try: + repo_root = os.path.dirname(os.path.realpath(__file__)) + dev_version_id = ( + subprocess.check_output(["git", "rev-parse", "--short", "HEAD"], cwd=repo_root) # nosec + .strip() + .decode() + ) + dev_version_id = "+" + dev_version_id + except subprocess.CalledProcessError: + pass + __version__ = __version__ + dev_version_id except Exception as error: assert False, "Error: Could not open '%s' due %s\n" % (filepath, error) INSTALL_REQUIRE = [ "torch>=1.11", - "optimum~=1.17", "transformers>=4.36.0,<4.39.0", + "optimum @ git+https://github.com/huggingface/optimum.git#egg=optimum", "datasets>=1.4.0", "sentencepiece", "scipy", @@ -38,6 +53,8 @@ "intel-extension-for-transformers>=1.3", "peft", "auto-gptq", + "transformers_stream_generator", + "einops", ] QUALITY_REQUIRE = ["black~=23.1", "ruff>=0.0.241"] diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 2188b7061f..9df6c73214 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -28,6 +28,7 @@ from parameterized import parameterized from PIL import Image from transformers import ( + AutoConfig, AutoFeatureExtractor, AutoModel, AutoModelForAudioClassification, @@ -52,7 
+53,6 @@ from transformers.onnx.utils import get_preprocessor from utils_tests import MODEL_NAMES -from optimum.exporters.onnx import MODEL_TYPES_REQUIRING_POSITION_IDS from optimum.intel import ( OVModelForAudioClassification, OVModelForAudioFrameClassification, @@ -473,73 +473,101 @@ def test_pipeline(self, model_arch): class OVModelForCausalLMIntegrationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES = ( "bart", + "baichuan2", "gpt_bigcode", "blenderbot", "blenderbot-small", "bloom", + "chatglm", "codegen", # "data2vec-text", # TODO : enable when enabled in exporters + "gemma", "gpt2", "gpt_neo", "gpt_neox", "llama", # "llama_gptq", "marian", + "minicpm", "mistral", + "mixtral", "mpt", "opt", "pegasus", + "qwen", + "qwen2", + "stablelm", ) GENERATION_LENGTH = 100 IS_SUPPORT_STATEFUL = is_openvino_version(">=", "2023.3") + REMOTE_CODE_MODELS = ("chatglm", "minicpm", "baichuan2", "jais", "qwen") @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_transformers(self, model_arch): model_id = MODEL_NAMES[model_arch] + not_stateful = ["gpt_bigcode"] + if is_openvino_version("<", "2024.0"): + not_stateful.append("mixtral") + + if is_openvino_version("<", "2024.1"): + not_stateful.extend(["llama", "gemma"]) if "gptq" in model_arch: self.skipTest("GPTQ model loading unsupported with AutoModelForCausalLM") set_seed(SEED) - ov_model = OVModelForCausalLM.from_pretrained(model_id, export=True, ov_config=F32_CONFIG) + + model_kwargs = {} + if model_arch in self.REMOTE_CODE_MODELS: + model_kwargs = { + "config": AutoConfig.from_pretrained(model_id, trust_remote_code=True), + "trust_remote_code": True, + } + ov_model = OVModelForCausalLM.from_pretrained(model_id, export=True, ov_config=F32_CONFIG, **model_kwargs) self.assertIsInstance(ov_model.config, PretrainedConfig) self.assertTrue(ov_model.use_cache) - - transformers_model = AutoModelForCausalLM.from_pretrained(model_id) - tokenizer = AutoTokenizer.from_pretrained(model_id) + self.assertEqual( + 
ov_model.stateful, self.IS_SUPPORT_STATEFUL and ov_model.config.model_type not in not_stateful + ) + transformers_model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs) + tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS) + if model_arch == "qwen": + transformers_model.to(torch.float32) tokens = tokenizer( "This is a sample", return_tensors="pt", return_token_type_ids=False if model_arch == "llama" else None ) - position_ids = None - if model_arch.replace("_", "-") in MODEL_TYPES_REQUIRING_POSITION_IDS: - input_shape = tokens["input_ids"].shape - position_ids = torch.arange(0, input_shape[-1], dtype=torch.long).unsqueeze(0).view(-1, input_shape[-1]) - ov_outputs = ov_model(**tokens, position_ids=position_ids) + ov_outputs = ov_model(**tokens) self.assertTrue("logits" in ov_outputs) self.assertIsInstance(ov_outputs.logits, torch.Tensor) self.assertTrue("past_key_values" in ov_outputs) self.assertIsInstance(ov_outputs.past_key_values, tuple) - - is_stateful = ov_model.config.model_type not in {"gpt_bigcode", "llama"} and self.IS_SUPPORT_STATEFUL + is_stateful = ov_model.config.model_type not in not_stateful and self.IS_SUPPORT_STATEFUL self.assertEqual(ov_model.stateful, is_stateful) if is_stateful: self.assertTrue(len(ov_outputs.past_key_values) == 1 and len(ov_outputs.past_key_values[0]) == 0) - with torch.no_grad(): transformers_outputs = transformers_model(**tokens) # Compare tensor outputs - self.assertTrue(torch.allclose(ov_outputs.logits, transformers_outputs.logits, atol=1e-4)) + self.assertTrue(torch.allclose(ov_outputs.logits, transformers_outputs.logits, equal_nan=True, atol=1e-4)) del transformers_model del ov_model gc.collect() @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_pipeline(self, model_arch): + model_kwargs = {} model_id = MODEL_NAMES[model_arch] - tokenizer = AutoTokenizer.from_pretrained(model_id) - model = OVModelForCausalLM.from_pretrained(model_id, 
export=True, use_cache=False, compile=False) + if model_arch in self.REMOTE_CODE_MODELS: + model_kwargs = { + "config": AutoConfig.from_pretrained(model_id, trust_remote_code=True), + "trust_remote_code": True, + } + tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS) + model = OVModelForCausalLM.from_pretrained( + model_id, export=True, use_cache=False, compile=False, **model_kwargs + ) model.config.encoder_no_repeat_ngram_size = 0 model.to("cpu") model.half() @@ -556,8 +584,16 @@ def test_pipeline(self, model_arch): def test_multiple_inputs(self, model_arch): model_id = MODEL_NAMES[model_arch] set_seed(SEED) - model = OVModelForCausalLM.from_pretrained(model_id, export=True, compile=False) - tokenizer = AutoTokenizer.from_pretrained(model_id) + if model_arch == "qwen": + self.skipTest("Qwen tokenizer does not support padding") + model_kwargs = {} + if model_arch in self.REMOTE_CODE_MODELS: + model_kwargs = { + "config": AutoConfig.from_pretrained(model_id, trust_remote_code=True), + "trust_remote_code": True, + } + model = OVModelForCausalLM.from_pretrained(model_id, export=True, compile=False, **model_kwargs) + tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS) tokenizer.pad_token = tokenizer.eos_token texts = ["this is a simple input", "this is a second simple input", "this is a third simple input"] tokens = tokenizer(texts, padding=True, return_tensors="pt") diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 97c8a92836..ad3cd03d3d 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -22,12 +22,14 @@ "beit": "hf-internal-testing/tiny-random-BeitForImageClassification", "bert": "hf-internal-testing/tiny-random-bert", "bart": "hf-internal-testing/tiny-random-bart", + "baichuan2": "katuni4ka/tiny-random-baichuan2", "bigbird_pegasus": "hf-internal-testing/tiny-random-bigbird_pegasus", 
"blenderbot-small": "hf-internal-testing/tiny-random-BlenderbotModel", "blenderbot": "hf-internal-testing/tiny-random-BlenderbotModel", "bloom": "hf-internal-testing/tiny-random-BloomModel", "camembert": "hf-internal-testing/tiny-random-camembert", "convbert": "hf-internal-testing/tiny-random-ConvBertForSequenceClassification", + "chatglm": "katuni4ka/tiny-random-chatglm2", "codegen": "hf-internal-testing/tiny-random-CodeGenForCausalLM", "data2vec_text": "hf-internal-testing/tiny-random-Data2VecTextModel", "data2vec_vision": "hf-internal-testing/tiny-random-Data2VecVisionModel", @@ -38,6 +40,7 @@ "convnext": "hf-internal-testing/tiny-random-convnext", "distilbert": "hf-internal-testing/tiny-random-distilbert", "electra": "hf-internal-testing/tiny-random-electra", + "gemma": "fxmarty/tiny-random-GemmaForCausalLM", "flaubert": "hf-internal-testing/tiny-random-flaubert", "gpt_bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel", "gpt2": "hf-internal-testing/tiny-random-gpt2", @@ -55,7 +58,9 @@ "opt125m": "facebook/opt-125m", "marian": "sshleifer/tiny-marian-en-de", "mbart": "hf-internal-testing/tiny-random-mbart", + "minicpm": "katuni4ka/tiny-random-minicpm", "mistral": "echarlaix/tiny-random-mistral", + "mixtral": "TitanML/tiny-mixtral", "mobilebert": "hf-internal-testing/tiny-random-MobileBertModel", "mobilenet_v1": "google/mobilenet_v1_0.75_192", "mobilenet_v2": "hf-internal-testing/tiny-random-MobileNetV2Model", @@ -66,6 +71,8 @@ "pegasus": "hf-internal-testing/tiny-random-pegasus", "pix2struct": "fxmarty/pix2struct-tiny-random", "poolformer": "hf-internal-testing/tiny-random-PoolFormerModel", + "qwen": "katuni4ka/tiny-random-qwen", + "qwen2": "Qwen/Qwen1.5-0.5B", "resnet": "hf-internal-testing/tiny-random-resnet", "roberta": "hf-internal-testing/tiny-random-roberta", "roformer": "hf-internal-testing/tiny-random-roformer", @@ -76,6 +83,7 @@ "stable-diffusion": "hf-internal-testing/tiny-stable-diffusion-torch", "stable-diffusion-xl": 
"echarlaix/tiny-random-stable-diffusion-xl", "stable-diffusion-xl-refiner": "echarlaix/tiny-random-stable-diffusion-xl-refiner", + "stablelm": "hf-internal-testing/tiny-random-StableLmForCausalLM", "latent-consistency": "echarlaix/tiny-random-latent-consistency", "sew": "hf-internal-testing/tiny-random-SEWModel", "sew_d": "asapp/sew-d-tiny-100k-ft-ls100h",