From d7d609fb7b99dcab54c851c7f61430345d441a1b Mon Sep 17 00:00:00 2001
From: Konrad Zawora
Date: Fri, 4 Oct 2024 20:01:57 +0300
Subject: [PATCH] Revert "Support loading checkpoints quantized using Autofp8
 (#286)"

This reverts commit 29fb5edd1df36aa4fa0ff95c7b2cbb711b8cb035.
---
 .../layers/fused_moe/fused_moe.py             |  4 --
 .../compressed_tensors/compressed_tensors.py  |  9 ++---
 .../schemes/compressed_tensors_w8a8_fp8.py    |  4 +-
 .../model_executor/layers/quantization/fp8.py | 24 ++++-------
 .../layers/quantization/utils/w8a8_utils.py   | 40 ++++---------------
 vllm/worker/hpu_model_runner.py               |  3 +-
 6 files changed, 22 insertions(+), 62 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index 52f748675f752..b1d3bc0a5f054 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -13,10 +13,6 @@
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 
-if current_platform.is_hpu():
-    from vllm_hpu_extension.ops import scaled_fp8_quant
-    ops.scaled_fp8_quant = scaled_fp8_quant
-
 logger = init_logger(__name__)
 
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index 670a27d5076bc..abb18d31b5a82 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -245,10 +245,8 @@ def _get_scheme_from_parts(
         # TODO @dsikka: clean-up conditions
         if is_activation_quantization_format(self.quant_format):
             if self._is_fp8_w8a8(weight_quant, input_quant):
-                is_fp8_w8a8_supported = current_platform.is_hpu() or \
-                    self._check_scheme_supported(
-                        CompressedTensorsW8A8Fp8.get_min_capability(),
-                        error=False)
+                is_fp8_w8a8_supported = self._check_scheme_supported(
+                    CompressedTensorsW8A8Fp8.get_min_capability(), error=False)
                 if is_fp8_w8a8_supported:
                     return CompressedTensorsW8A8Fp8(
                         strategy=weight_quant.strategy,
@@ -320,8 +318,7 @@ def get_scheme(
 
         # Raise error if device does not support the scheme
         # (e.g. fp8 needs ada lovelace)
-        if not current_platform.is_hpu():
-            self._check_scheme_supported(scheme.get_min_capability())
+        self._check_scheme_supported(scheme.get_min_capability())
 
         return scheme
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
index 29f3228c0dc5d..5931ec36c97d5 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
@@ -13,7 +13,6 @@
 from vllm.model_executor.parameter import (ChannelQuantScaleParameter,
                                            ModelWeightParameter,
                                            PerTensorScaleParameter)
-from vllm.platforms import current_platform
 from vllm.utils import is_hip
 
 __all__ = ["CompressedTensorsW8A8Fp8"]
@@ -24,8 +23,7 @@ class CompressedTensorsW8A8Fp8(CompressedTensorsScheme):
     def __init__(self, strategy: str, is_static_input_scheme: bool):
         self.strategy = strategy
         self.is_static_input_scheme = is_static_input_scheme
-        self.cutlass_fp8_supported = not current_platform.is_hpu() and \
-            cutlass_fp8_supported()
+        self.cutlass_fp8_supported = cutlass_fp8_supported()
 
     @classmethod
     def get_min_capability(cls) -> int:
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 88915942220ca..b5feb55db0e74 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -28,10 +28,6 @@
 from vllm.platforms import current_platform
 from vllm.utils import is_hip, print_warning_once
 
-if current_platform.is_hpu():
-    from vllm_hpu_extension.ops import scaled_fp8_quant
-    ops.scaled_fp8_quant = scaled_fp8_quant
-
 ACTIVATION_SCHEMES = ["static", "dynamic"]
 
 logger = init_logger(__name__)
@@ -120,18 +116,14 @@ class Fp8LinearMethod(LinearMethodBase):
 
     def __init__(self, quant_config: Fp8Config):
         self.quant_config = quant_config
-        if current_platform.is_cuda_alike():
-            self.cutlass_fp8_supported = cutlass_fp8_supported()
-
-            # For GPUs that lack FP8 hardware support, we can leverage the
-            # Marlin kernel for fast weight-only FP8 quantization
-            self.use_marlin = (not current_platform.has_device_capability(89)
-                               or envs.VLLM_TEST_FORCE_FP8_MARLIN)
-            # Disable marlin for rocm
-            if is_hip():
-                self.use_marlin = False
-        else:
-            self.cutlass_fp8_supported = False
+        self.cutlass_fp8_supported = cutlass_fp8_supported()
+
+        # For GPUs that lack FP8 hardware support, we can leverage the Marlin
+        # kernel for fast weight-only FP8 quantization
+        self.use_marlin = (not current_platform.has_device_capability(89)
+                           or envs.VLLM_TEST_FORCE_FP8_MARLIN)
+        # Disable marlin for rocm
+        if is_hip():
+            self.use_marlin = False
 
     def create_weights(
diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
index bf1aa6fbd5dca..411af922149fd 100644
--- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
@@ -10,11 +10,6 @@
 # from pytorch 2.5. Allocating a dummy tensor to pass as input_scale
 TORCH_DEVICE_IDENTITY = torch.ones(1).cuda() if is_hip() else None
 
-if current_platform.is_hpu():
-    import habana_frameworks.torch.utils.experimental as htexp
-    from vllm_hpu_extension.ops import scaled_fp8_quant
-    ops.scaled_fp8_quant = scaled_fp8_quant
-
 
 def cutlass_fp8_supported() -> bool:
     # cutlass is not supported on Rocm
@@ -30,15 +25,7 @@ def cutlass_fp8_supported() -> bool:
 def per_tensor_dequantize(
         tensor: torch.Tensor,
         inv_scale: Union[float, torch.Tensor]) -> torch.Tensor:
-    dtype = torch.float16
-    device = tensor.device
-    if current_platform.is_hpu():
-        dtype = torch.bfloat16
-        if htexp._get_device_type() == htexp.synDeviceType.synDeviceGaudi2:
-            #dequant on cpu to avoid nan on gaudi2
-            tensor = tensor.to('cpu')
-
-    fake_qweight = tensor.to(dtype).to(device)
+    fake_qweight = tensor.to(torch.float16)
     dq_weight = fake_qweight * inv_scale
     return dq_weight
 
@@ -71,10 +58,7 @@ def requantize_with_max_scale(
         logical_widths: List[int]) -> Tuple[torch.Tensor, torch.Tensor]:
     # Max scale to be used for requanitzation.
     max_w_scale = weight_scale.max()
-    if current_platform.is_hpu() and htexp._get_device_type(
-    ) == htexp.synDeviceType.synDeviceGaudi2:
-        max_w_scale = max_w_scale * (torch.finfo(torch.float8_e4m3fn).max /
-                                     torch.finfo(torch.float8_e4m3fnuz).max)
+
     # QKV / MLP is fused in the on disk checkpoint if any of the
     # weight scales are still set to the default since we initialize
     # N weight scales for N shards but we only load 1 weight scale
@@ -145,20 +129,12 @@ def apply_fp8_linear(
 
     if per_tensor_weights and per_tensor_activations:
         # Fused GEMM_DQ
-        if current_platform.is_hpu():
-            #hpu does not support torch._scaled_mm (SW-197036)
-            output = torch.ops.hpu.fp8_gemm_v2(qinput, False, weight,
-                                               False, None, input.dtype,
-                                               x_scale, weight_scale, None,
-                                               False)
-        else:
-            output = torch._scaled_mm(qinput,
-                                      weight,
-                                      out_dtype=input.dtype,
-                                      scale_a=x_scale,
-                                      scale_b=weight_scale,
-                                      bias=bias)
-
+        output = torch._scaled_mm(qinput,
+                                  weight,
+                                  out_dtype=input.dtype,
+                                  scale_a=x_scale,
+                                  scale_b=weight_scale,
+                                  bias=bias)
         # A fix for discrepancy in scaled_mm which returns tuple
         # for torch < 2.5 and a single value in torch >= 2.5
         if type(output) is tuple and len(output) == 2:
diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 47bcda1f753af..72052b0adeb31 100644
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -587,7 +587,8 @@ def _set_gc_threshold(self) -> None:
 
     def load_model(self) -> None:
         import habana_frameworks.torch.core as htcore
-        htcore.hpu_set_env()
+        if self.model_config.quantization == 'inc':
+            htcore.hpu_set_env()
         with HabanaMemoryProfiler() as m:
             with HabanaMemoryProfiler() as m_getmodel:
                 self.model = get_model(model_config=self.model_config,
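
Note on the main functional change reverted above: `per_tensor_dequantize`
returns to a plain float16 dequantization, and `requantize_with_max_scale`
drops the Gaudi2-specific rescaling of the max weight scale. Below is a
minimal sketch of the two behaviors, restated outside the diff for reference.
It assumes torch >= 2.1 (for the fp8 dtypes); the standalone names
`per_tensor_dequantize_restored` and `GAUDI2_SCALE_ADJUSTMENT` are
illustrative only, not part of the patch.

    import torch

    # Behavior restored by this revert: dequantize the fp8 weight through
    # float16, with no platform-specific branching.
    def per_tensor_dequantize_restored(tensor: torch.Tensor,
                                       inv_scale: float) -> torch.Tensor:
        fake_qweight = tensor.to(torch.float16)
        return fake_qweight * inv_scale

    # The reverted HPU path additionally multiplied the max weight scale on
    # Gaudi2 by the ratio of the two fp8 e4m3 ranges, i.e. 448.0 / 240.0,
    # consistent with checkpoints storing float8_e4m3fn scales while Gaudi2
    # uses the narrower float8_e4m3fnuz range:
    GAUDI2_SCALE_ADJUSTMENT = (torch.finfo(torch.float8_e4m3fn).max /
                               torch.finfo(torch.float8_e4m3fnuz).max)
    # ~= 1.8667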