From cacb9baa3f4f65879ef4939a2916d5ca54a98b00 Mon Sep 17 00:00:00 2001
From: Nir David
Date: Sun, 15 Dec 2024 17:08:00 +0200
Subject: [PATCH] Add inc fp8 quantization documentation

---
 .../getting_started/gaudi-installation.rst |  2 +-
 docs/source/quantization/inc.rst           | 71 +++++++++++++++++++
 2 files changed, 72 insertions(+), 1 deletion(-)
 create mode 100644 docs/source/quantization/inc.rst

diff --git a/docs/source/getting_started/gaudi-installation.rst b/docs/source/getting_started/gaudi-installation.rst
index ee733afd27578..4457d3be7a12e 100644
--- a/docs/source/getting_started/gaudi-installation.rst
+++ b/docs/source/getting_started/gaudi-installation.rst
@@ -46,7 +46,7 @@ To verify that the Intel Gaudi software was correctly installed, run:
     $ hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible
     $ apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed
     $ pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed
-    $ pip list | grep neural # verify that neural_compressor is installed
+    $ pip list | grep neural # verify that neural_compressor_pt is installed
 
 Refer to `System Verification and Final Tests `__ for more details.
diff --git a/docs/source/quantization/inc.rst b/docs/source/quantization/inc.rst
new file mode 100644
index 0000000000000..899f8758981ec
--- /dev/null
+++ b/docs/source/quantization/inc.rst
@@ -0,0 +1,71 @@
+.. _INC:
+
+FP8 INC
+==================
+
+vLLM supports FP8 (8-bit floating point) weight and activation quantization using INC (Intel Neural Compressor) on Intel Gaudi (HPU) hardware accelerators.
+Currently, quantization is supported only for Llama models.
+
+Please refer to the Intel Gaudi documentation on `Run Inference Using FP8 `_.
+
+Running inference requires measurement/scale files:
+
+Retrieve Measurements
+---------------------
+
+To obtain measurement files:
+
+* Use the ``inc`` quantization method (as a parameter to the LLM object).
+* Call the ``shutdown_inc`` and ``shutdown`` methods of the ``model_executor`` at the end of the run.
+
+.. code-block:: python
+
+    from vllm import LLM
+    llm = LLM("llama3.1/Meta-Llama-3.1-8B-Instruct", quantization="inc")
+    ...
+    # Call llm.generate on the required prompts and sampling params.
+    ...
+    llm.llm_engine.model_executor.shutdown_inc()
+    llm.llm_engine.model_executor.shutdown()
+
+.. note::
+
+    Make sure to supply the ``QUANT_CONFIG`` environment variable, which points to the `JSON config file `_ with MEASURE mode.
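+For reference, a complete measurement run might look like the sketch below. It is illustrative only: the config path is a placeholder for your own MEASURE-mode JSON file and the calibration prompts are examples; exporting ``QUANT_CONFIG`` in the shell before launching works just as well as setting it from Python.
+
+.. code-block:: python
+
+    import os
+    from vllm import LLM, SamplingParams
+
+    # Placeholder path: point QUANT_CONFIG at your own MEASURE-mode JSON config.
+    os.environ["QUANT_CONFIG"] = "/path/to/measure_config.json"
+
+    llm = LLM("llama3.1/Meta-Llama-3.1-8B-Instruct", quantization="inc")
+
+    # Run a representative set of prompts so INC can record activation statistics.
+    calibration_prompts = ["Hello, my name is", "The capital of France is"]
+    llm.generate(calibration_prompts, SamplingParams(temperature=0.0, max_tokens=32))
+
+    # Shut down so the measurement files are written out.
+    llm.llm_engine.model_executor.shutdown_inc()
+    llm.llm_engine.model_executor.shutdown()
+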
+Run Inference Using FP8
+-----------------------
+
+Intel Gaudi supports quantization of linear layers, the KV cache, and functions such as Matmul and Softmax, as listed in:
+`Supported Modules `_,
+`Supported Functions `_.
+
+Running inference requires scales, which are located in scale files at the ``dump_stats_path`` given in the `JSON config file `_.
+If no scale files exist, they can be generated during the inference run from the measurement files (which should be located in the same folder).
+
+To run inference (and obtain scale files):
+
+* Use the ``inc`` quantization method (as a parameter to the LLM object).
+* Use the ``fp8_inc`` KV cache dtype (as a parameter to the LLM object).
+* Call the ``shutdown`` method of the ``model_executor`` at the end of the run.
+
+.. code-block:: python
+
+    from vllm import LLM
+    llm = LLM("llama3.1/Meta-Llama-3.1-8B-Instruct", quantization="inc", kv_cache_dtype="fp8_inc")
+    ...
+    # Call llm.generate on the required prompts and sampling params.
+    ...
+    llm.llm_engine.model_executor.shutdown()
+
+.. note::
+
+    Make sure to supply the ``QUANT_CONFIG`` environment variable, which points to the `JSON config file `_ with QUANTIZE mode.
+
+Specifying the Device for Loading the Model's Weights
+-----------------------------------------------------
+
+It is possible to load the (unquantized) weights onto a different device before quantizing them
+and moving them to the device on which the model will run.
+Use the ``weights_load_device`` parameter of the LLM object to specify this device.
+
+.. code-block:: python
+
+    from vllm import LLM
+    llm = LLM("llama3.1/Meta-Llama-3.1-8B-Instruct", quantization="inc", kv_cache_dtype="fp8_inc", weights_load_device="cpu")
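+
+Putting it all together, a complete quantized-inference run might look like the sketch below. It is illustrative only: the config path is a placeholder for your own QUANTIZE-mode JSON file, and the prompt and sampling settings are examples.
+
+.. code-block:: python
+
+    import os
+    from vllm import LLM, SamplingParams
+
+    # Placeholder path: point QUANT_CONFIG at your own QUANTIZE-mode JSON config.
+    os.environ["QUANT_CONFIG"] = "/path/to/quantize_config.json"
+
+    # Load the unquantized weights on the CPU first, then quantize and move them to the HPU.
+    llm = LLM(
+        "llama3.1/Meta-Llama-3.1-8B-Instruct",
+        quantization="inc",
+        kv_cache_dtype="fp8_inc",
+        weights_load_device="cpu",
+    )
+
+    outputs = llm.generate(["Hello, my name is"], SamplingParams(temperature=0.0, max_tokens=32))
+    print(outputs[0].outputs[0].text)
+
+    llm.llm_engine.model_executor.shutdown()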