From 590024d23c4dad4a69e6c44f639256e00d0fa970 Mon Sep 17 00:00:00 2001
From: "Wang, Chang" <chang1.wang@intel.com>
Date: Thu, 4 Jul 2024 14:00:46 +0800
Subject: [PATCH 1/6] remove optimum-intel version limit (#1651)

Signed-off-by: changwangss <chang1.wang@intel.com>
Co-authored-by: chensuyue <suyue.chen@intel.com>
---
 .github/workflows/script/install_binary.sh                     | 3 ++-
 .github/workflows/unit-test-optimize.yml                       | 2 +-
 .../pytorch/text-generation/quantization/requirements_sq.txt   | 2 +-
 tests/requirements.txt                                         | 2 +-
 4 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/script/install_binary.sh b/.github/workflows/script/install_binary.sh
index 7bca0d4d2f3..a33a6607b1b 100644
--- a/.github/workflows/script/install_binary.sh
+++ b/.github/workflows/script/install_binary.sh
@@ -1,4 +1,5 @@
 #!/bin/bash
+
 source /intel-extension-for-transformers/.github/workflows/script/change_color.sh
 
 cd /intel-extension-for-transformers
@@ -10,7 +11,7 @@ git config --global --add safe.directory "*"
 git submodule update --init --recursive
 
 
-$BOLD_YELLOW && echo "---------------- run python setup.py sdist bdist_wheel -------------" && $RESET
+$BOLD_YELLOW && echo "---------------- run python setup.py bdist_wheel -------------" && $RESET
 python setup.py bdist_wheel
 
 
diff --git a/.github/workflows/unit-test-optimize.yml b/.github/workflows/unit-test-optimize.yml
index 4d11947d92c..6399df03878 100644
--- a/.github/workflows/unit-test-optimize.yml
+++ b/.github/workflows/unit-test-optimize.yml
@@ -67,7 +67,7 @@ jobs:
         with:
           submodules: "recursive"
           ref: ${{ matrix.test_branch }}
-          fetch-tags: true
+          fetch-depth: 0
 
       - name: Docker Build
         run: |
diff --git a/examples/huggingface/pytorch/text-generation/quantization/requirements_sq.txt b/examples/huggingface/pytorch/text-generation/quantization/requirements_sq.txt
index 047f65d091a..1ce834dda7b 100644
--- a/examples/huggingface/pytorch/text-generation/quantization/requirements_sq.txt
+++ b/examples/huggingface/pytorch/text-generation/quantization/requirements_sq.txt
@@ -7,7 +7,7 @@ sentencepiece != 0.1.92
 torch==2.3.0+cpu
 transformers==4.38.1
 intel_extension_for_pytorch==2.3.0
-optimum-intel==1.16.1
+optimum-intel
 bitsandbytes  #baichuan
 transformers_stream_generator
 tiktoken  #qwen
diff --git a/tests/requirements.txt b/tests/requirements.txt
index 4dbac7def89..4ae2eb85d49 100644
--- a/tests/requirements.txt
+++ b/tests/requirements.txt
@@ -16,7 +16,7 @@ mlflow
 nlpaug==1.1.9
 onnx
 onnxruntime
-optimum-intel==1.16.1
+optimum-intel
 peft==0.6.2
 py-cpuinfo
 sacremoses

From c57621131fa0154ff4a1917662c5befba008bd4d Mon Sep 17 00:00:00 2001
From: "Wang, Chang" <chang1.wang@intel.com>
Date: Fri, 5 Jul 2024 13:13:16 +0800
Subject: [PATCH 2/6] fix optimum-intel version (#1654)

Signed-off-by: changwangss <chang1.wang@intel.com>
---
 .../pytorch/code-generation/quantization/Dockerfile-multiple    | 2 +-
 .../pytorch/code-generation/quantization/requirements.txt       | 2 +-
 .../pytorch/text-generation/quantization/requirements_GPU.txt   | 2 +-
 .../pytorch/text-generation/quantization/requirements_sq.txt    | 2 +-
 tests/requirements.txt                                          | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/examples/huggingface/pytorch/code-generation/quantization/Dockerfile-multiple b/examples/huggingface/pytorch/code-generation/quantization/Dockerfile-multiple
index a903d161614..d58375c38b4 100644
--- a/examples/huggingface/pytorch/code-generation/quantization/Dockerfile-multiple
+++ b/examples/huggingface/pytorch/code-generation/quantization/Dockerfile-multiple
@@ -61,7 +61,7 @@ ENV COMPOSE_DOCKER_CLI_BUILD=0
 # Install torch and intel-extension-for-pytorch 2.1
 RUN python3 -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
 RUN python3 -m pip install intel-extension-for-pytorch intel-extension-for-transformers optimum
-RUN python3 -m pip install git+https://github.com/huggingface/optimum-intel.git@f95dea1ae8966dee4d75d622e7b2468c514ba02d
+RUN python3 -m pip install git+https://github.com/huggingface/optimum-intel.git@50d867c13b22c22eda451ddb67bddb8159670f85
 RUN python3 -m pip install git+https://github.com/bigcode-project/bigcode-evaluation-harness@0d84db85f9ff971fa23a187a3347b7f59af288dc
 
 # Standard requirements
diff --git a/examples/huggingface/pytorch/code-generation/quantization/requirements.txt b/examples/huggingface/pytorch/code-generation/quantization/requirements.txt
index 5f02605d7d0..455eccd2b26 100644
--- a/examples/huggingface/pytorch/code-generation/quantization/requirements.txt
+++ b/examples/huggingface/pytorch/code-generation/quantization/requirements.txt
@@ -10,6 +10,6 @@ transformers >= 4.35.0
 tiktoken #code_gen
 neural-compressor
 intel_extension_for_pytorch==2.3.0
-optimum-intel
+git+https://github.com/huggingface/optimum-intel.git@50d867c13b22c22eda451ddb67bddb8159670f85
 auto-round==0.2
 git+https://github.com/bigcode-project/bigcode-evaluation-harness@094c7cc197d13a53c19303865e2056f1c7488ac1
diff --git a/examples/huggingface/pytorch/text-generation/quantization/requirements_GPU.txt b/examples/huggingface/pytorch/text-generation/quantization/requirements_GPU.txt
index 65980710432..1b000e0c61b 100644
--- a/examples/huggingface/pytorch/text-generation/quantization/requirements_GPU.txt
+++ b/examples/huggingface/pytorch/text-generation/quantization/requirements_GPU.txt
@@ -7,7 +7,7 @@ sentencepiece != 0.1.92
 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
 torch==2.1.0a0
 transformers
-optimum-intel
+git+https://github.com/huggingface/optimum-intel.git@50d867c13b22c22eda451ddb67bddb8159670f85
 bitsandbytes  #baichuan
 transformers_stream_generator
 tiktoken  #qwen
diff --git a/examples/huggingface/pytorch/text-generation/quantization/requirements_sq.txt b/examples/huggingface/pytorch/text-generation/quantization/requirements_sq.txt
index 1ce834dda7b..02655339b5d 100644
--- a/examples/huggingface/pytorch/text-generation/quantization/requirements_sq.txt
+++ b/examples/huggingface/pytorch/text-generation/quantization/requirements_sq.txt
@@ -7,7 +7,7 @@ sentencepiece != 0.1.92
 torch==2.3.0+cpu
 transformers==4.38.1
 intel_extension_for_pytorch==2.3.0
-optimum-intel
+git+https://github.com/huggingface/optimum-intel.git@50d867c13b22c22eda451ddb67bddb8159670f85
 bitsandbytes  #baichuan
 transformers_stream_generator
 tiktoken  #qwen
diff --git a/tests/requirements.txt b/tests/requirements.txt
index 4ae2eb85d49..d2c2dca3f74 100644
--- a/tests/requirements.txt
+++ b/tests/requirements.txt
@@ -7,6 +7,7 @@ datasets==2.16.1
 einops
 evaluate
 gguf
+git+https://github.com/huggingface/optimum-intel.git@50d867c13b22c22eda451ddb67bddb8159670f85
 git+https://github.com/intel/neural-compressor.git
 git+https://github.com/intel/neural-speed.git
 intel-extension-for-pytorch==2.3.0
@@ -16,7 +17,6 @@ mlflow
 nlpaug==1.1.9
 onnx
 onnxruntime
-optimum-intel
 peft==0.6.2
 py-cpuinfo
 sacremoses

From 3e85ca9f8a5d9e4fc5879b20b63a29e68987c972 Mon Sep 17 00:00:00 2001
From: "Cheng, Penghui" <penghui.cheng@intel.com>
Date: Fri, 5 Jul 2024 17:20:54 +0800
Subject: [PATCH 3/6] Support huggingface popular weight format for weight-only
 quantization (#1580)

* Support huggingface popular weight format for weight-only quantization

Signed-off-by: Cheng Penghui <penghui.cheng@intel.com>

* Fixed issue of loading woq model for intel gpu

Signed-off-by: Cheng Penghui <penghui.cheng@intel.com>

* update qconfig for xpu

Signed-off-by: zhenwei-intel <zhenwei.liu@intel.com>

---------

Signed-off-by: Cheng Penghui <penghui.cheng@intel.com>
Signed-off-by: zhenwei-intel <zhenwei.liu@intel.com>
Co-authored-by: zhenwei-intel <zhenwei.liu@intel.com>
---
 .../quantization/run_generation_gpu_woq.py    |  7 +---
 .../transformers/llm/quantization/utils.py    | 33 +++++++++++++------
 .../transformers/modeling/modeling_auto.py    | 10 +++---
 3 files changed, 30 insertions(+), 20 deletions(-)

diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_generation_gpu_woq.py b/examples/huggingface/pytorch/text-generation/quantization/run_generation_gpu_woq.py
index 9363b45cf5f..d54dd5f127f 100644
--- a/examples/huggingface/pytorch/text-generation/quantization/run_generation_gpu_woq.py
+++ b/examples/huggingface/pytorch/text-generation/quantization/run_generation_gpu_woq.py
@@ -142,12 +142,7 @@
 
 user_model = None
 
-# tokenizer
-if config.model_type == "llama":
-   from transformers import LlamaTokenizer
-   tokenizer = LlamaTokenizer.from_pretrained(args.model)
-else:
-   tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=args.trust_remote_code)
+tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=args.trust_remote_code)
 
 quantization_config = None
 if args.woq:
diff --git a/intel_extension_for_transformers/transformers/llm/quantization/utils.py b/intel_extension_for_transformers/transformers/llm/quantization/utils.py
index 81bf61879e2..f912135db1a 100644
--- a/intel_extension_for_transformers/transformers/llm/quantization/utils.py
+++ b/intel_extension_for_transformers/transformers/llm/quantization/utils.py
@@ -20,9 +20,8 @@
 import gc
 import math
 import os
-from ...utils import CpuInfo
+from ....tools.utils import _ipex_version
 from accelerate import init_empty_weights
-from datasets import load_dataset
 from neural_compressor import quantization
 from neural_compressor.adaptor.torch_utils.model_wrapper import WeightOnlyLinear
 from neural_compressor.utils.utility import LazyImport
@@ -31,7 +30,6 @@
     is_ipex_available,
     is_autoround_available,
 )
-from transformers import AutoTokenizer
 
 if is_ipex_available():
     import intel_extension_for_pytorch as ipex
@@ -273,10 +271,12 @@ def _replace_linear(
                             scale_dtype=quantization_config.scale_dtype,
                             blocksize=quantization_config.group_size,
                             scheme=quantization_config.scheme,
-                            compression_dtype=getattr(module, "compression_dtype", torch.int8),
-                            compression_dim=getattr(module, "compression_dim", 0),
+                            compression_dtype=getattr(module, "compression_dtype",
+                                                      torch.int8 if _ipex_version < "2.3.10" else torch.int32),
+                            compression_dim=getattr(module, "compression_dim", 0 if _ipex_version < "2.3.10" else 1),
                             device=device,
-                            use_optimum_format=getattr(module, "use_optimum_format", False),
+                            use_optimum_format=getattr(module, "use_optimum_format",
+                                                       False if _ipex_version < "2.3.10" else True),
                         )
                         if quantization_config.quant_method.value == "gptq":
                             g_idx = getattr(module, "g_idx", torch.zeros(in_features, dtype=torch.int32).to(device))
@@ -297,6 +297,17 @@ def _replace_linear(
                                         quantization_config.compute_dtype
                                     ),
                                     device=torch.device(device),
+                                ) if _ipex_version < "2.3.10" else torch.ones(
+                                    (
+                                        math.ceil(
+                                            in_features / quantization_config.group_size
+                                        ),
+                                        out_features,
+                                    ),
+                                    dtype=convert_dtype_str2torch(
+                                        quantization_config.compute_dtype
+                                    ),
+                                    device=torch.device(device),
                                 )
                             ),
                             module.qzeros if hasattr(module, "qzeros") else None,
@@ -348,11 +359,13 @@ def _replace_linear(
                 else:
                     if not hasattr(module, "qweight"):
                         n_pack = (
-                            8 // DTYPE_BITS_MAPPING[quantization_config.weight_dtype]
+                            (8 if _ipex_version < "2.3.10" else 32)
+                            // DTYPE_BITS_MAPPING[quantization_config.weight_dtype]
                         )
                         weight = torch.zeros(
-                            (math.ceil(out_features / n_pack), in_features),
-                            dtype=torch.int8,
+                            (math.ceil(out_features / n_pack), in_features) if _ipex_version < "2.3.10" else
+                            (math.ceil(in_features / n_pack), out_features),
+                            dtype=torch.int8 if _ipex_version < "2.3.10" else torch.int32,
                             device=torch.device(device),
                         )
                     model._modules[name].set_weights_bias(
@@ -592,7 +605,7 @@ def default_calib_func(model):
                 use_optimum_format=False,
                 scale_dtype=convert_dtype_str2torch(config.scale_dtype),
                 device="xpu",
-            )
+            ) if _ipex_version < "2.3.10" else inc_model.export_compressed_model(use_optimum_format=True, device="xpu")
 
             q_model = replace_linear(model, None, None, config, device=device)
         else:
diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py
index a5be8cdc519..1314e464eff 100644
--- a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py
+++ b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py
@@ -182,7 +182,7 @@ def convert_model_to_public(model):
     # reorder weight and scales if they have been transposed
     if model.device == "xpu" or (isinstance(model.device, torch.device) and model.device.type == "xpu"):
         for name, module in model.named_modules():
-            if isinstance(module, WeightOnlyQuantizedLinear):
+            if isinstance(module, WeightOnlyQuantizedLinear) and not module.use_optimum_format:
                 if module.weight_transposed:
                     module.qweight.data = module.qweight.t_().contiguous()
                     module.scales.data = module.scales.t_().contiguous()
@@ -198,6 +198,7 @@ def convert_model_to_public(model):
     ]:
         model = recover_export_model(model)
 
+
 def make_contiguous(model):
     for param in model.parameters():
         if param.data.ndimension() > 1:
@@ -1871,7 +1872,10 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs):
         # weight dtype is higher priority than bits in config.json when both existed.
         if quantization_config.weight_dtype is None:
             if quantization_config.bits == 4:
-                quantization_config.weight_dtype = "int4_clip"
+                if use_xpu:
+                    quantization_config.weight_dtype = "int4_fullrange"
+                else:
+                    quantization_config.weight_dtype = "int4_clip"
                 logger.info(
                     "{} quantization weight_dtype is used due to bits is 4 in config.json.".format(
                         quantization_config.weight_dtype)
@@ -1917,7 +1921,6 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs):
             "fp4_e2m1",
             "fp4_e2m1_bnb",
             "nf4",
-            "int4_fullrange",
         ]:
             model = build_woq_model(model, quantization_config)
         else:
@@ -2025,7 +2028,6 @@ def replace_ipex_cpu_woq_linear(model, current_name=[]):
             "nf4",
             "fp4_e2m1",
             "fp4_e2m1_bnb",
-            "int4_fullrange",
         ] and not quantization_config.use_ipex:
             model = replace_linear(
                 model,

From e79a71c2b9e0c7e2e2241ff7f6e6584a5d9eafd9 Mon Sep 17 00:00:00 2001
From: Tyler Titsworth <tyler.titsworth@intel.com>
Date: Mon, 8 Jul 2024 18:21:00 -0700
Subject: [PATCH 4/6] Apply regreSSHion mitigation (#1657)

Signed-off-by: Tyler Titsworth <tyler.titsworth@intel.com>
---
 docker/Dockerfile_chatbot | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docker/Dockerfile_chatbot b/docker/Dockerfile_chatbot
index 3bdb8cf9e05..2908acf8b69 100644
--- a/docker/Dockerfile_chatbot
+++ b/docker/Dockerfile_chatbot
@@ -153,6 +153,7 @@ RUN apt update \
     && apt install -y wget numactl git nvidia-cuda* \
     && apt install -y openssh-server \
     && apt install -y python${PYTHON_VERSION} python3-pip \
+    && printf '# Mitigation for CVE-2024-6387, see https://ubuntu.com/security/CVE-2024-6387\nLoginGraceTime 0\n' >> /etc/ssh/sshd_config \
     && apt clean \
     && rm -rf /var/lib/apt/lists/*
 RUN ln -s /usr/bin/python3 /usr/bin/python

From 20765abeddd2f2ce1f16eb5ab7354c51bb4417fb Mon Sep 17 00:00:00 2001
From: "Sun, Xuehao" <xuehao.sun@intel.com>
Date: Tue, 9 Jul 2024 14:02:29 +0800
Subject: [PATCH 5/6] update recipes to align INC2.6 (#1612)

Signed-off-by: Sun, Xuehao <xuehao.sun@intel.com>
---
 .../quantization/llm_quantization_recipes.md  | 21 ++++++++++++-------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/examples/huggingface/pytorch/text-generation/quantization/llm_quantization_recipes.md b/examples/huggingface/pytorch/text-generation/quantization/llm_quantization_recipes.md
index e65b975284a..4fd7a3cda14 100644
--- a/examples/huggingface/pytorch/text-generation/quantization/llm_quantization_recipes.md
+++ b/examples/huggingface/pytorch/text-generation/quantization/llm_quantization_recipes.md
@@ -40,8 +40,9 @@ pip install -v .
 # install requirements
 cd examples/huggingface/pytorch/text-generation/quantization
 pip install -r requirements.txt
-pip install neural-compressor==2.5
-pip install transformers==4.35.2
+pip install neural-compressor==2.6
+pip install transformers==4.38.1
+
 pip install torch==2.3.0+cpu --index-url https://download.pytorch.org/whl/cpu
 pip install intel-extension-for-pytorch==2.3.0
 ```
@@ -221,10 +222,11 @@ python run_generation_sq.py \
     --calib_len 2048 \
     --fallback_add \
     --calib_shuffle False \
+    --calib_iters 512 \
     --tasks lambada_openai \
     --int8 --sq --accuracy \
     --batch_size 1 \
-    --recipes "{'smooth_quant': True, 'smooth_quant_args': {'alpha': 'auto', 'folding': False, 'default_alpha': 0.8, 'auto_alpha_args': {'alpha_min': 0.8, 'alpha_max': 0.99, 'alpha_step': 0.01, 'shared_criterion': 'mean'}}}"
+    --recipes "{'smooth_quant': True, 'smooth_quant_args': {'alpha': 'auto', 'folding': False, 'default_alpha': 0.8, 'auto_alpha_args': {'alpha_min': 0.79, 'alpha_max': 0.99, 'alpha_step': 0.01, 'shared_criterion': 'mean'}}}"
 ```
 
 ### Weight-Only Quantization
@@ -276,11 +278,12 @@ python run_generation_sq.py \
     --trust_remote_code \
     --calib_len 1024 \
     --fallback_add \
+    --calib_iters 512 \
     --calib_padding \
     --tasks lambada_openai \
     --int8 --sq --accuracy \
     --batch_size 1 \
-    --recipes "{'smooth_quant': True, 'smooth_quant_args': {'alpha': 'auto', 'folding': False, 'default_alpha': 0.8, 'auto_alpha_args': {'alpha_min': 0.75, 'alpha_max': 0.99, 'alpha_step': 0.01, 'shared_criterion': 'max'}}}"
+    --recipes "{'smooth_quant': True, 'smooth_quant_args': {'alpha': 'auto', 'folding': False, 'default_alpha': 0.8, 'auto_alpha_args': {'alpha_min': 0.75, 'alpha_max': 0.99, 'alpha_step': 0.01, 'shared_criterion': 'max', 'n_samples':64}}}"
 ```
 
 ### Weight-Only Quantization
@@ -544,7 +547,7 @@ python run_generation_sq.py \
     --tasks lambada_openai \
     --int8 --sq --accuracy \
     --batch_size 1 \
-    --alpha 0.65
+    --alpha 1.0
 ```
 
 ### Weight-Only Quantization
@@ -650,8 +653,10 @@ python run_generation_sq.py \
     --trust_remote_code \
     --tasks lambada_openai \
     --int8 --sq --accuracy \
+    --calib_iters 512 \
     --batch_size 1 \
-    --alpha 0.75
+    --recipes "{'smooth_quant':True,'smooth_quant_args':{'alpha':'auto','folding':False,'default_alpha':0.7,'auto_alpha_args':{'alpha_min':0.55,'alpha_max':0.8,'alpha_step':0.01,'shared_criterion':'mean','n_samples':64}}}" \
+    --calib_iters 512
 ```
 
 ### Weight-Only Quantization
@@ -702,8 +707,8 @@ python run_generation_sq.py \
     --trust_remote_code \
     --tasks lambada_openai \
     --int8 --sq --accuracy \
-    --batch_size 1 \
-    --alpha 0.9
+    --recipes "{'smooth_quant':True,'smooth_quant_args':{'alpha':'auto','folding':False,'default_alpha':0.85,'auto_alpha_args':{'alpha_min':0.79,'alpha_max':0.88,'alpha_step':0.01,'shared_criterion':'mean'}}}" \
+    --batch_size 1
 ```
 
 ### Weight-Only Quantization

From 3fd99c853c570dfcf3ba6aeeff5c034ad07c4f1d Mon Sep 17 00:00:00 2001
From: "Wang, Zhe" <zhe1.wang@intel.com>
Date: Wed, 10 Jul 2024 10:39:21 +0800
Subject: [PATCH 6/6] qbits support f4 weight repack (#1653)

* qbits support f4 weight repack

* fix
---
 .../include/bestla_weightonly_dispatcher.hpp  |  1 -
 .../dispatcher/include/dispatcher_utils.hpp   |  7 +++
 .../dispatcher/src/bestla_packq_impl.cpp      | 51 +++++++++++++------
 .../src/bestla_weightonly_dispatcher.cpp      | 16 ++----
 4 files changed, 47 insertions(+), 28 deletions(-)

diff --git a/intel_extension_for_transformers/qbits/dispatcher/include/bestla_weightonly_dispatcher.hpp b/intel_extension_for_transformers/qbits/dispatcher/include/bestla_weightonly_dispatcher.hpp
index 1f33cfc663b..784c512220f 100644
--- a/intel_extension_for_transformers/qbits/dispatcher/include/bestla_weightonly_dispatcher.hpp
+++ b/intel_extension_for_transformers/qbits/dispatcher/include/bestla_weightonly_dispatcher.hpp
@@ -14,7 +14,6 @@
 #pragma once
 #include <ATen/core/TensorBody.h>
 #include <torch/torch.h>
-#include "bestla/bestla_storage.h"
 #include "../include/dispatcher_utils.hpp"
 #include <string.h>
 #include <assert.h>
diff --git a/intel_extension_for_transformers/qbits/dispatcher/include/dispatcher_utils.hpp b/intel_extension_for_transformers/qbits/dispatcher/include/dispatcher_utils.hpp
index 8a0c99b3b3a..05a8c718b26 100644
--- a/intel_extension_for_transformers/qbits/dispatcher/include/dispatcher_utils.hpp
+++ b/intel_extension_for_transformers/qbits/dispatcher/include/dispatcher_utils.hpp
@@ -16,6 +16,7 @@
 #include <chrono>
 #include <string>
 #include "bestla/bestla_device.h"
+#include "bestla/bestla_storage.h"
 #include "bestla/bestla_utils.h"
 #include "bestla/bestla_parallel.h"
 namespace dispatcher_utils {
@@ -26,6 +27,12 @@ inline bool check_avx_vnni() { return bestla::device::CpuDevice::getInstance()->
 inline bool check_avx512f() { return bestla::device::CpuDevice::getInstance()->AVX512F(); }
 inline bool check_avx2() { return bestla::device::CpuDevice::getInstance()->AVX2(); }
 
+template <class GemmCore>  // GemmCore must expose a static `ISA` constant comparable with BTLA_ISA values
+constexpr bool is_int8_cmpt_gemmcore() {  // true for the ISA tags / exact core type tested below; usable in `if constexpr`
+  return GemmCore::ISA == BTLA_ISA::AMX_INT8 || GemmCore::ISA == BTLA_ISA::AVX512_VNNI ||
+         GemmCore::ISA == BTLA_ISA::AVX_VNNI || std::is_same_v<GemmCore, bestla::gemm::ICoreRowNAvx2vnniKBlock<24, 2>>;
+}
+
 class qbits_threading {
  public:
   static bestla::parallel::IThreading* get() {
diff --git a/intel_extension_for_transformers/qbits/dispatcher/src/bestla_packq_impl.cpp b/intel_extension_for_transformers/qbits/dispatcher/src/bestla_packq_impl.cpp
index 399deaba7e0..cf6889a9f15 100644
--- a/intel_extension_for_transformers/qbits/dispatcher/src/bestla_packq_impl.cpp
+++ b/intel_extension_for_transformers/qbits/dispatcher/src/bestla_packq_impl.cpp
@@ -16,12 +16,19 @@
 #include "../include/bestla_packq_impl.hpp"
 
 namespace woq {
-template <class GemmCore, BTLA_ISA ISA>
+
+template <class proB>
 void execute_qpack(repack_quantized_weight_param* p, repack_quantized_weight_ctx* ctx, WOQ_TASK task) {
-  using proB = bestla::prologue_b::gemm::WeightKBlockNInteger<GemmCore, ISA>;
   static proB ker;
-  auto qpackw = ker.createStorage(ctx->n, ctx->k, p->blocksize, wei2bestladt_map.at(p->weight_type),
-                                  scale2bestladt_map.at(p->scale_type), BTLA_DTYPE::BF16, p->asym);
+  using WType = typename proB::StorageWeight;
+  WType qpackw(0);
+  if constexpr (std::is_same_v<WType, bestla::storage::gemm::StorageWeightKBlockNInteger>) {
+    qpackw = ker.createStorage(ctx->n, ctx->k, p->blocksize, wei2bestladt_map.at(p->weight_type),
+                               scale2bestladt_map.at(p->scale_type), BTLA_DTYPE::BF16, p->asym);
+  } else {
+    qpackw = ker.createStorage(ctx->n, ctx->k, p->blocksize, wei2bestladt_map.at(p->weight_type),
+                               scale2bestladt_map.at(p->scale_type));
+  }
   if (p->enable_act_shuffle) ker.enableShuffle(&qpackw);
   ctx->packw_size = qpackw.mSize;
   if (task == WOQ_GET_PACKW_SIZE) return;
@@ -33,6 +40,20 @@ void execute_qpack(repack_quantized_weight_param* p, repack_quantized_weight_ctx
                   p->asym ? ctx->zp->data_ptr<int8_t>() : nullptr, &qpackw, dispatcher_utils::qbits_threading::get());
 }
 
+template <class GemmCore, BTLA_ISA ISA>  // picks the weight prologue passed to execute_qpack from p->weight_type
+void parse_prob(repack_quantized_weight_param* p, repack_quantized_weight_ctx* ctx, WOQ_TASK task) {
+  if (p->weight_type == "int8" || p->weight_type == "int4_clip" || p->weight_type == "int3_clip" ||
+      p->weight_type == "int2_clip") {  // integer weight types -> WeightKBlockNInteger prologue
+    return execute_qpack<bestla::prologue_b::gemm::WeightKBlockNInteger<GemmCore, ISA>>(p, ctx, task);
+  }  // nf4 / fp4 types below use the WeightKBlockNFloat prologue and are rejected when p->asym is set
+  if (p->weight_type == "nf4" || p->weight_type == "fp4_e2m1_bnb" || p->weight_type == "fp4_e2m1") {
+    TORCH_CHECK(!p->asym, "Qbits: float-weight unsupports asym quantization.");
+    return execute_qpack<bestla::prologue_b::gemm::WeightKBlockNFloat<GemmCore, ISA>>(p, ctx, task);
+  }  // any other weight_type reaches the unconditional TORCH_CHECK failure below
+  TORCH_CHECK(false, "Qbits: unsupported bestla packq config, compute_type: " + p->compute_type +
+                         " weight_type: " + p->weight_type);
+}
+
 std::string get_dtype_str(BTLA_DTYPE dtype) {
   switch (dtype) {
     case BTLA_DTYPE::F32:
@@ -183,40 +204,38 @@ torch::Tensor get_packw_info(torch::Tensor& packw, PACKW_ACQUIRE_TYPE ACQ_T) {
 }
 
 void bestla_packq(repack_quantized_weight_param* p, repack_quantized_weight_ctx* ctx, WOQ_TASK task) {
-  // TODO(zhe): elegant impl.
-  TORCH_CHECK(p->weight_type == "int8" || p->weight_type == "int4_clip" || p->weight_type == "int3_clip" ||
-                  p->weight_type == "int2_clip",
-              "Qbits: only support Integer WOQ in PACKQ");
-
   if (p->compute_type == "int8") {
+    TORCH_CHECK(p->weight_type == "int8" || p->weight_type == "int4_clip" || p->weight_type == "int3_clip" ||
+                    p->weight_type == "int2_clip",
+                "Qbits: only support Integer weight-type with int8 compute-type");
     if (dispatcher_utils::check_amx() && p->blocksize % bestla::gemm::ICoreRowNAmxint8KBlock<64, 16>::KTILE == 0) {
-      return execute_qpack<bestla::gemm::ICoreRowNAmxint8KBlock<64, 16>, BTLA_ISA::AMX_INT8>(p, ctx, task);
+      return parse_prob<bestla::gemm::ICoreRowNAmxint8KBlock<64, 16>, BTLA_ISA::AMX_INT8>(p, ctx, task);
     }
     if (dispatcher_utils::check_avx512_vnni() &&
         p->blocksize % bestla::gemm::ICoreRowNAvx512vnniKBlock<48, 4>::KTILE == 0) {
-      return execute_qpack<bestla::gemm::ICoreRowNAvx512vnniKBlock<48, 4>, BTLA_ISA::AVX512_VNNI>(p, ctx, task);
+      return parse_prob<bestla::gemm::ICoreRowNAvx512vnniKBlock<48, 4>, BTLA_ISA::AVX512_VNNI>(p, ctx, task);
     }
     if (dispatcher_utils::check_avx_vnni() && p->blocksize % bestla::gemm::ICoreRowNAvxvnniKBlock<24, 2>::KTILE == 0) {
-      return execute_qpack<bestla::gemm::ICoreRowNAvxvnniKBlock<24, 2>, BTLA_ISA::AVX_VNNI>(p, ctx, task);
+      return parse_prob<bestla::gemm::ICoreRowNAvxvnniKBlock<24, 2>, BTLA_ISA::AVX_VNNI>(p, ctx, task);
     }
     if (dispatcher_utils::check_avx2() && p->blocksize % bestla::gemm::ICoreRowNAvx2vnniKBlock<24, 2>::KTILE == 0) {
-      return execute_qpack<bestla::gemm::ICoreRowNAvx2vnniKBlock<24, 2>, BTLA_ISA::AVX2>(p, ctx, task);
+      return parse_prob<bestla::gemm::ICoreRowNAvx2vnniKBlock<24, 2>, BTLA_ISA::AVX2>(p, ctx, task);
     }
     TORCH_CHECK(false, "Qbits: Illegal config in int8 compute_type, blocksize:", p->blocksize,
                 ", ISA support avx2:", dispatcher_utils::check_avx2());
   }
   if (p->compute_type == "fp32") {
     if (dispatcher_utils::check_avx512f()) {
-      return execute_qpack<bestla::gemm::SCoreRowNAvx512f<48, 8>, BTLA_ISA::AVX512F>(p, ctx, task);
+      return parse_prob<bestla::gemm::SCoreRowNAvx512f<48, 8>, BTLA_ISA::AVX512F>(p, ctx, task);
     }
     if (dispatcher_utils::check_avx2()) {
-      return execute_qpack<bestla::gemm::SCoreRowNAvx2<24, 4>, BTLA_ISA::AVX2>(p, ctx, task);
+      return parse_prob<bestla::gemm::SCoreRowNAvx2<24, 4>, BTLA_ISA::AVX2>(p, ctx, task);
     }
     TORCH_CHECK(false, "Qbits: device ISA must support BTLA_ISA::AVX2 when compute_type==fp32");
   }
   if (p->compute_type == "bf16") {
     if (dispatcher_utils::check_amx()) {
-      return execute_qpack<bestla::gemm::HCoreRowNAmxbf16<64, 16>, BTLA_ISA::AMX_BF16>(p, ctx, task);
+      return parse_prob<bestla::gemm::HCoreRowNAmxbf16<64, 16>, BTLA_ISA::AMX_BF16>(p, ctx, task);
     }
     TORCH_CHECK(false, "Qbits: device ISA must support AMX-BF16 when compute_type==bf16");
   }
diff --git a/intel_extension_for_transformers/qbits/dispatcher/src/bestla_weightonly_dispatcher.cpp b/intel_extension_for_transformers/qbits/dispatcher/src/bestla_weightonly_dispatcher.cpp
index f9864ddece0..c04e652a4aa 100644
--- a/intel_extension_for_transformers/qbits/dispatcher/src/bestla_weightonly_dispatcher.cpp
+++ b/intel_extension_for_transformers/qbits/dispatcher/src/bestla_weightonly_dispatcher.cpp
@@ -43,12 +43,6 @@ concept quant_PrologueA = requires {
   requires !std::is_same_v<T, bestla::utils::bf16>;
 };
 
-template <class GemmCore>
-constexpr bool is_int8_cmpt_gemmcore() {
-  return GemmCore::ISA == BTLA_ISA::AMX_INT8 || GemmCore::ISA == BTLA_ISA::AVX512_VNNI ||
-         GemmCore::ISA == BTLA_ISA::AVX_VNNI || std::is_same_v<GemmCore, bestla::gemm::ICoreRowNAvx2vnniKBlock<24, 2>>;
-}
-
 template <class Launcher>
 void dequantize_packed_weight(woq_config_param* p, woq_runtime_ctx* ctx) {
   if (dispatcher_utils::initer.verbose) dispatcher_utils::timer.start();
@@ -133,7 +127,7 @@ void do_compute(woq_config_param* p, woq_runtime_ctx* ctx, ParamA param_a) {
   using StorageWeight = typename Launcher::PrologueB::StorageWeight;
   size_t asym_size = 0, shuf_size = 0;
   int8_t* tmpbuf = nullptr;
-  if constexpr (is_int8_cmpt_gemmcore<GemmCore>()) {
+  if constexpr (dispatcher_utils::is_int8_cmpt_gemmcore<GemmCore>()) {
     using Parallel = bestla::parallel::gemm::SchedulerKBlockS<GemmCore>;
     bestla::utils::GemmProblem gp(1, ctx->m, ctx->n, ctx->k, p->blocksize);
     StorageWeight* packedw = dynamic_cast<StorageWeight*>(ctx->deseries_wei);
@@ -236,7 +230,7 @@ void execute_task(woq_config_param* p, woq_runtime_ctx* ctx) {
 template <WOQ_TASK TASK, class GemmCore, template <class _T, BTLA_ISA> class PrologueB,
           template <class _T, BTLA_ISA> class PrologueA, template <BTLA_ISA> class Epilogue>
 void parse_launcher(woq_config_param* p, woq_runtime_ctx* ctx) {
-  if constexpr (is_int8_cmpt_gemmcore<GemmCore>()) {
+  if constexpr (dispatcher_utils::is_int8_cmpt_gemmcore<GemmCore>()) {
     using Launcher = bestla::wrapper::gemm::LauncherIntKBlock<GemmCore::ISA, GemmCore, PrologueA, PrologueB, Epilogue>;
     return execute_task<TASK, Launcher>(p, ctx);
   } else {
@@ -260,7 +254,7 @@ template <WOQ_TASK TASK, class GemmCore, template <class _T, BTLA_ISA> class Pro
 void parse_activation(woq_config_param* p, woq_runtime_ctx* ctx) {
   using namespace bestla::prologue_a::gemm;
   if (p->src_dt == dispatcher_utils::QBITS_FP32) {
-    if constexpr (is_int8_cmpt_gemmcore<GemmCore>()) {
+    if constexpr (dispatcher_utils::is_int8_cmpt_gemmcore<GemmCore>()) {
       return parse_store<TASK, GemmCore, PrologueB, ShuffleActivationKBlockQuantizeF32, dispatcher_utils::QBITS_FP32>(
           p, ctx);
     } else {
@@ -269,7 +263,7 @@ void parse_activation(woq_config_param* p, woq_runtime_ctx* ctx) {
     }
   }
   if (p->src_dt == dispatcher_utils::QBITS_BF16) {
-    if constexpr (is_int8_cmpt_gemmcore<GemmCore>()) {
+    if constexpr (dispatcher_utils::is_int8_cmpt_gemmcore<GemmCore>()) {
       return parse_store<TASK, GemmCore, PrologueB, ShuffleActivationKBlockQuantizeBf16, dispatcher_utils::QBITS_BF16>(
           p, ctx);
     } else {
@@ -289,7 +283,7 @@ void parse_weight(woq_config_param* p, woq_runtime_ctx* ctx) {
   if (p->weight_type == "nf4" || p->weight_type == "fp4_e2m1_bnb" || p->weight_type == "fp4_e2m1" ||
       p->weight_type == "fp8_e4m3" || p->weight_type == "fp8_e5m2") {
     TORCH_CHECK(!p->asym, "Qbits: float-weight unsupports asym quantization.");
-    if constexpr (!is_int8_cmpt_gemmcore<GemmCore>())
+    if constexpr (!dispatcher_utils::is_int8_cmpt_gemmcore<GemmCore>())
       return parse_activation<TASK, GemmCore, WeightKBlockNFloat>(p, ctx);
   }
   TORCH_CHECK(false,