From eb0d42fca6bf771bbf98be0f6b540b76ff5b2096 Mon Sep 17 00:00:00 2001 From: Nir David <124874956+nirda7@users.noreply.github.com> Date: Mon, 13 Jan 2025 16:44:07 +0200 Subject: [PATCH] Add inc fp8 qunatization documentation (#635) --- .../getting_started/gaudi-installation.rst | 2 +- docs/source/index.rst | 1 + docs/source/quantization/inc.rst | 64 +++++++++++++++++++ .../quantization/supported_hardware.rst | 24 ++++++- vllm/platforms/hpu.py | 1 + 5 files changed, 90 insertions(+), 2 deletions(-) create mode 100644 docs/source/quantization/inc.rst diff --git a/docs/source/getting_started/gaudi-installation.rst b/docs/source/getting_started/gaudi-installation.rst index ee733afd27578..4457d3be7a12e 100644 --- a/docs/source/getting_started/gaudi-installation.rst +++ b/docs/source/getting_started/gaudi-installation.rst @@ -46,7 +46,7 @@ To verify that the Intel Gaudi software was correctly installed, run: $ hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible $ apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed $ pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed - $ pip list | grep neural # verify that neural_compressor is installed + $ pip list | grep neural # verify that neural_compressor_pt is installed Refer to `System Verification and Final Tests `__ for more details. 
diff --git a/docs/source/index.rst b/docs/source/index.rst index ebf1361976c5e..7c545a85ac2be 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -121,6 +121,7 @@ Documentation quantization/auto_awq quantization/bnb quantization/gguf + quantization/inc quantization/int8 quantization/fp8 quantization/fp8_e5m2_kvcache diff --git a/docs/source/quantization/inc.rst b/docs/source/quantization/inc.rst new file mode 100644 index 0000000000000..ad7e21af54c40 --- /dev/null +++ b/docs/source/quantization/inc.rst @@ -0,0 +1,64 @@ +.. _INC: + +FP8 INC +======= + +vLLM supports FP8 (8-bit floating point) weight and activation quantization using Intel® Neural Compressor (INC) on Intel® Gaudi® 2 and Intel® Gaudi® 3 AI accelerators. +Currently, quantization is validated only for Llama models. + +Intel Gaudi supports quantization of various modules and functions, including, but not limited to, ``Linear``, ``KVCache``, ``Matmul`` and ``Softmax``. For more information, please refer to: +`Supported Modules\\Supported Functions\\Custom Patched Modules `_. + +.. note:: + Measurement files are required to run quantized models with vLLM on Gaudi accelerators. The FP8 model calibration procedure is described in the `vllm-hpu-extension `_ package. + +.. note:: + ``QUANT_CONFIG`` is an environment variable that points to the measurement or quantization `JSON config file `_. + The measurement configuration file is used during the calibration procedure to collect measurements for a given model. The quantization configuration is used during inference. + +Run Online Inference Using FP8 +------------------------------- + +Once you've completed the model calibration process and collected the measurements, you can run FP8 inference with vLLM using the following command: + +.. 
code-block:: bash + + export QUANT_CONFIG=/path/to/quant/config/inc/meta-llama-3.1-405b-instruct/maxabs_measure_g3.json + vllm serve meta-llama/Llama-3.1-405B-Instruct --quantization inc --kv-cache-dtype fp8_inc --weights-load-device cpu --tensor-parallel-size 8 + +.. tip:: + If you are just prototyping or testing your model with FP8, you can use the ``VLLM_SKIP_WARMUP=true`` environment variable to disable the warmup stage, which can take a long time. However, we do not recommend disabling this feature in production environments as it causes a significant performance drop. + +.. tip:: + When using FP8 models, you may experience timeouts caused by the long compilation time of FP8 operations. To mitigate this problem, you can use the environment variables below: + ``VLLM_ENGINE_ITERATION_TIMEOUT_S`` - to adjust the vLLM server timeout. You can set the value in seconds, e.g., 600 equals 10 minutes. + ``VLLM_RPC_TIMEOUT`` - to adjust the RPC protocol timeout used by the OpenAI-compatible API. This value is in milliseconds, e.g., 600000 equals 10 minutes. + +Run Offline Inference Using FP8 +------------------------------- + +To run offline inference (after completing the model calibration process): +* Set the ``QUANT_CONFIG`` environment variable to point to a JSON configuration file with QUANTIZE mode. +* Pass ``quantization=inc`` and ``kv_cache_dtype=fp8_inc`` as parameters to the ``LLM`` object. +* Call the ``shutdown`` method of the ``model_executor`` at the end of the run. + +.. code-block:: python + + from vllm import LLM + llm = LLM("llama3.1/Meta-Llama-3.1-8B-Instruct", quantization="inc", kv_cache_dtype="fp8_inc") + ... + # Call llm.generate on the required prompts and sampling params. + ... 
+ llm.llm_engine.model_executor.shutdown() + +Specifying Device for the Model's Weights Uploading +--------------------------------------------------- + +It is possible to load the unquantized weights on a different device, quantize them there, and then move them to the device on which the model will run. +This reduces the device memory footprint of model weights, as only quantized weights are stored in device memory. +To set the device on which the weights are loaded, use the ``weights_load_device`` parameter for the ``LLM`` object, or the ``--weights-load-device`` command-line parameter when running online inference: + +.. code-block:: python + + from vllm import LLM + llm = LLM("llama3.1/Meta-Llama-3.1-8B-Instruct", quantization="inc", kv_cache_dtype="fp8_inc", weights_load_device="cpu") diff --git a/docs/source/quantization/supported_hardware.rst b/docs/source/quantization/supported_hardware.rst index 09f8e7112cf0c..56ffa59bc1df7 100644 --- a/docs/source/quantization/supported_hardware.rst +++ b/docs/source/quantization/supported_hardware.rst @@ -7,7 +7,7 @@ The table below shows the compatibility of various quantization implementations .. 
list-table:: :header-rows: 1 - :widths: 20 8 8 8 8 8 8 8 8 8 8 + :widths: 20 8 8 8 8 8 8 8 8 8 8 8 * - Implementation - Volta @@ -17,6 +17,7 @@ The table below shows the compatibility of various quantization implementations - Hopper - AMD GPU - Intel GPU + - Intel Gaudi - x86 CPU - AWS Inferentia - Google TPU @@ -28,6 +29,7 @@ The table below shows the compatibility of various quantization implementations - ✅︎ - ✗ - ✅︎ + - ✗ - ✅︎ - ✗ - ✗ @@ -39,6 +41,7 @@ The table below shows the compatibility of various quantization implementations - ✅︎ - ✗ - ✅︎ + - ✗ - ✅︎ - ✗ - ✗ @@ -53,6 +56,7 @@ The table below shows the compatibility of various quantization implementations - ✗ - ✗ - ✗ + - ✗ * - INT8 (W8A8) - ✗ - ✅︎ @@ -61,6 +65,7 @@ The table below shows the compatibility of various quantization implementations - ✅︎ - ✗ - ✗ + - ✗ - ✅︎ - ✗ - ✗ @@ -75,6 +80,7 @@ The table below shows the compatibility of various quantization implementations - ✗ - ✗ - ✗ + - ✗ * - AQLM - ✅︎ - ✅︎ @@ -86,6 +92,7 @@ The table below shows the compatibility of various quantization implementations - ✗ - ✗ - ✗ + - ✗ * - bitsandbytes - ✅︎ - ✅︎ @@ -97,6 +104,7 @@ The table below shows the compatibility of various quantization implementations - ✗ - ✗ - ✗ + - ✗ * - DeepSpeedFP - ✅︎ - ✅︎ @@ -108,6 +116,7 @@ The table below shows the compatibility of various quantization implementations - ✗ - ✗ - ✗ + - ✗ * - GGUF - ✅︎ - ✅︎ @@ -119,6 +128,19 @@ The table below shows the compatibility of various quantization implementations - ✗ - ✗ - ✗ + - ✗ + * - INC (W8A8) + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ + - ✅︎ + - ✗ + - ✗ + - ✗ Notes: ^^^^^^ diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py index 314cd98212e9c..d75f146ecf4b8 100644 --- a/vllm/platforms/hpu.py +++ b/vllm/platforms/hpu.py @@ -13,6 +13,7 @@ class HpuPlatform(Platform): device_name: str = "hpu" device_type: str = "hpu" dispatch_key: str = "HPU" + supported_quantization: list[str] = ["inc"] @classmethod def get_default_attn_backend(cls, 
selected_backend: _Backend) -> _Backend: