diff --git a/.github/workflows/unit-test-optimize.yml b/.github/workflows/unit-test-optimize.yml
index 6399df03878..edfad46b42b 100644
--- a/.github/workflows/unit-test-optimize.yml
+++ b/.github/workflows/unit-test-optimize.yml
@@ -45,7 +45,7 @@ jobs:
             test_name: "PR-test"
           - test_branch: "main"
             test_name: "baseline"
-      fail-fast: true
+      fail-fast: false
     name: optimize-unit-test-${{ matrix.test_name }}
     steps:
       - name: Docker Clean Up
diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py
index d54837e2220..bf8420f6b8f 100644
--- a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py
+++ b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py
@@ -67,7 +67,11 @@
     convert_to_smoothquant_model,
     replace_linear,
 )
-from ...tools.utils import is_intel_gpu_available, is_ipex_available, _neural_compressor_version
+from ...tools.utils import (
+    is_intel_gpu_available,
+    is_ipex_available,
+    _neural_compressor_version,
+)
 from accelerate import init_empty_weights
 from huggingface_hub import hf_hub_download
 from neural_compressor.torch.algorithms.weight_only.modules import WeightOnlyLinear
@@ -1832,7 +1836,6 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs):
         if quantization_config.weight_dtype not in [
             "fp8_e5m2",
             "fp8_e4m3",
-            "int4_fullrange",
         ]:
             model = build_woq_model(model, quantization_config)
         else:
@@ -1949,7 +1952,6 @@ def replace_ipex_cpu_woq_linear(model, current_name=[]):
         if quantization_config.weight_dtype not in [
             "fp8_e5m2",
             "fp8_e4m3",
-            "int4_fullrange",
         ] and not quantization_config.use_ipex:
             model = replace_linear(
                 model,