From ef2a1d4ea5a6236e0f0bc5f2e1d7f668e240d476 Mon Sep 17 00:00:00 2001
From: changwangss
Date: Sun, 4 Aug 2024 21:19:38 -0700
Subject: [PATCH 1/7] fix int8 skip module config

Signed-off-by: changwangss
---
 .../transformers/utils/config.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/intel_extension_for_transformers/transformers/utils/config.py b/intel_extension_for_transformers/transformers/utils/config.py
index 2314512db63..9c77032d6b6 100644
--- a/intel_extension_for_transformers/transformers/utils/config.py
+++ b/intel_extension_for_transformers/transformers/utils/config.py
@@ -831,7 +831,10 @@ def __init__(
         self.double_quant_bits = double_quant_bits
         self.double_quant_use_sym = double_quant_use_sym
         self.double_quant_group_size = double_quant_group_size
-        self.llm_int8_skip_modules = kwargs.get("llm_int8_skip_modules", ["lm_head", "output_layer", "embed_out"])
+        # "transformer.output_layer" for chatglm series model.
+        # "gpt_neox.embed_out" for dolly v2 series model.
+        self.llm_int8_skip_modules = kwargs.get("llm_int8_skip_modules", 
+                                                ["lm_head", "transformer.output_layer", "gpt_neox.embed_out"])
         self.use_ggml = use_ggml
         self.use_quant = use_quant
         self.use_neural_speed = use_neural_speed

From 0bc842876280ac7b88c18d1e02b4005b62651338 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 5 Aug 2024 04:20:53 +0000
Subject: [PATCH 2/7] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 intel_extension_for_transformers/transformers/utils/config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/intel_extension_for_transformers/transformers/utils/config.py b/intel_extension_for_transformers/transformers/utils/config.py
index 9c77032d6b6..524ab619362 100644
--- a/intel_extension_for_transformers/transformers/utils/config.py
+++ b/intel_extension_for_transformers/transformers/utils/config.py
@@ -833,7 +833,7 @@ def __init__(
         self.double_quant_group_size = double_quant_group_size
         # "transformer.output_layer" for chatglm series model.
         # "gpt_neox.embed_out" for dolly v2 series model.
-        self.llm_int8_skip_modules = kwargs.get("llm_int8_skip_modules", 
+        self.llm_int8_skip_modules = kwargs.get("llm_int8_skip_modules",
                                                 ["lm_head", "transformer.output_layer", "gpt_neox.embed_out"])
         self.use_ggml = use_ggml
         self.use_quant = use_quant

From a08344a795531a31a146902ce7d370fa25538751 Mon Sep 17 00:00:00 2001
From: changwangss
Date: Sun, 4 Aug 2024 21:26:24 -0700
Subject: [PATCH 3/7] fix embed_out

Signed-off-by: changwangss
---
 .../transformers/modeling/modeling_auto.py | 2 ++
 .../transformers/utils/config.py           | 6 +++---
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py
index 63540e11a74..47f8b1e7e60 100644
--- a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py
+++ b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py
@@ -161,7 +161,9 @@ def build_woq_model(model, quantization_config):
     from neural_compressor.adaptor.torch_utils.util import set_module
     weight_dtype = quantization_config.weight_dtype
     for n, m in model.named_modules():
+        print(n)
         if n in quantization_config.llm_int8_skip_modules:
+            # import pdb;pdb.set_trace();
             continue
         if isinstance(m, torch.nn.Linear):
             zp = getattr(
diff --git a/intel_extension_for_transformers/transformers/utils/config.py b/intel_extension_for_transformers/transformers/utils/config.py
index 524ab619362..5362d172cb8 100644
--- a/intel_extension_for_transformers/transformers/utils/config.py
+++ b/intel_extension_for_transformers/transformers/utils/config.py
@@ -832,9 +832,9 @@ def __init__(
         self.double_quant_use_sym = double_quant_use_sym
         self.double_quant_group_size = double_quant_group_size
         # "transformer.output_layer" for chatglm series model.
-        # "gpt_neox.embed_out" for dolly v2 series model.
-        self.llm_int8_skip_modules = kwargs.get("llm_int8_skip_modules",
-                                                ["lm_head", "transformer.output_layer", "gpt_neox.embed_out"])
+        # "embed_out" for dolly v2 series model.
+        self.llm_int8_skip_modules = kwargs.get("llm_int8_skip_modules", 
+                                                ["lm_head", "transformer.output_layer", "embed_out"])
         self.use_ggml = use_ggml
         self.use_quant = use_quant
         self.use_neural_speed = use_neural_speed

From 7914ccc2ea25485658c4333e6caeec4e842d1169 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 5 Aug 2024 04:27:05 +0000
Subject: [PATCH 4/7] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 intel_extension_for_transformers/transformers/utils/config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/intel_extension_for_transformers/transformers/utils/config.py b/intel_extension_for_transformers/transformers/utils/config.py
index 5362d172cb8..48e72039d63 100644
--- a/intel_extension_for_transformers/transformers/utils/config.py
+++ b/intel_extension_for_transformers/transformers/utils/config.py
@@ -833,7 +833,7 @@ def __init__(
         self.double_quant_group_size = double_quant_group_size
         # "transformer.output_layer" for chatglm series model.
         # "embed_out" for dolly v2 series model.
-        self.llm_int8_skip_modules = kwargs.get("llm_int8_skip_modules", 
+        self.llm_int8_skip_modules = kwargs.get("llm_int8_skip_modules",
                                                 ["lm_head", "transformer.output_layer", "embed_out"])
         self.use_ggml = use_ggml
         self.use_quant = use_quant

From 1de8b17829e927cd5d6be2a9cdedf00cc819d181 Mon Sep 17 00:00:00 2001
From: "Wang, Chang"
Date: Mon, 5 Aug 2024 12:27:19 +0800
Subject: [PATCH 5/7] Update modeling_auto.py

Signed-off-by: Wang, Chang
---
 .../transformers/modeling/modeling_auto.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py
index 47f8b1e7e60..63540e11a74 100644
--- a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py
+++ b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py
@@ -161,9 +161,7 @@ def build_woq_model(model, quantization_config):
     from neural_compressor.adaptor.torch_utils.util import set_module
     weight_dtype = quantization_config.weight_dtype
     for n, m in model.named_modules():
-        print(n)
         if n in quantization_config.llm_int8_skip_modules:
-            # import pdb;pdb.set_trace();
             continue
         if isinstance(m, torch.nn.Linear):
             zp = getattr(

From 827c95dea86d9553d82ca5d564a0871d002ba0a2 Mon Sep 17 00:00:00 2001
From: "Wang, Chang"
Date: Tue, 6 Aug 2024 11:00:03 +0800
Subject: [PATCH 6/7] Update config.py

Signed-off-by: Wang, Chang
---
 .../transformers/utils/config.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/intel_extension_for_transformers/transformers/utils/config.py b/intel_extension_for_transformers/transformers/utils/config.py
index 48e72039d63..de055309c6e 100644
--- a/intel_extension_for_transformers/transformers/utils/config.py
+++ b/intel_extension_for_transformers/transformers/utils/config.py
@@ -914,7 +914,8 @@ def __init__(
         self.true_sequential = true_sequential
         self.layer_wise = layer_wise
         self.seq_len = seq_len
-        self.llm_int8_skip_modules = kwargs.get("llm_int8_skip_modules", ["lm_head", "output_layer", "embed_out"])
+        self.llm_int8_skip_modules = kwargs.get("llm_int8_skip_modules", 
+                                                ["lm_head", "transformer.output_layer", "embed_out"])
         self.use_ggml = use_ggml
         self.use_quant = use_quant
         self.use_neural_speed = use_neural_speed
@@ -1012,7 +1013,8 @@ def __init__(
         self.seq_len = seq_len
         self.use_double_quant = use_double_quant
         self.double_quant_scale_dtype = double_quant_scale_dtype
-        self.llm_int8_skip_modules = kwargs.get("llm_int8_skip_modules", ["lm_head", "output_layer", "embed_out"])
+        self.llm_int8_skip_modules = kwargs.get("llm_int8_skip_modules", 
+                                                ["lm_head", "transformer.output_layer", "embed_out"])
         self.use_ggml = use_ggml
         self.use_quant = use_quant
         self.use_neural_speed = use_neural_speed
@@ -1081,7 +1083,8 @@ def __init__(
         self.seq_len = seq_len
         self.use_double_quant = use_double_quant
         self.double_quant_scale_dtype = double_quant_scale_dtype
-        self.llm_int8_skip_modules = kwargs.get("llm_int8_skip_modules", ["lm_head", "output_layer", "embed_out"])
+        self.llm_int8_skip_modules = kwargs.get("llm_int8_skip_modules", 
+                                                ["lm_head", "transformer.output_layer", "embed_out"])
         self.use_ggml = use_ggml
         self.use_neural_speed = use_neural_speed
         self.device = kwargs.get("device", "auto")
@@ -1157,7 +1160,8 @@ def __init__(
         self.iters = iters
         self.seq_len = seq_len
         self.quant_lm_head = quant_lm_head
-        self.llm_int8_skip_modules = kwargs.get("llm_int8_skip_modules", ["lm_head", "output_layer", "embed_out"])
+        self.llm_int8_skip_modules = kwargs.get("llm_int8_skip_modules", 
+                                                ["lm_head", "transformer.output_layer", "embed_out"])
         if self.quant_lm_head:
             self.llm_int8_skip_modules = []
         self.use_ggml = use_ggml

From 439511cb3ac36f01d1c0535a8c8bd642073e49ee Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 6 Aug 2024 03:00:46 +0000
Subject: [PATCH 7/7] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 .../transformers/utils/config.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/intel_extension_for_transformers/transformers/utils/config.py b/intel_extension_for_transformers/transformers/utils/config.py
index de055309c6e..3f328c2ff33 100644
--- a/intel_extension_for_transformers/transformers/utils/config.py
+++ b/intel_extension_for_transformers/transformers/utils/config.py
@@ -914,7 +914,7 @@ def __init__(
         self.true_sequential = true_sequential
         self.layer_wise = layer_wise
         self.seq_len = seq_len
-        self.llm_int8_skip_modules = kwargs.get("llm_int8_skip_modules", 
+        self.llm_int8_skip_modules = kwargs.get("llm_int8_skip_modules",
                                                 ["lm_head", "transformer.output_layer", "embed_out"])
         self.use_ggml = use_ggml
         self.use_quant = use_quant
@@ -1013,7 +1013,7 @@ def __init__(
         self.seq_len = seq_len
         self.use_double_quant = use_double_quant
         self.double_quant_scale_dtype = double_quant_scale_dtype
-        self.llm_int8_skip_modules = kwargs.get("llm_int8_skip_modules", 
+        self.llm_int8_skip_modules = kwargs.get("llm_int8_skip_modules",
                                                 ["lm_head", "transformer.output_layer", "embed_out"])
         self.use_ggml = use_ggml
         self.use_quant = use_quant
@@ -1083,7 +1083,7 @@ def __init__(
         self.seq_len = seq_len
         self.use_double_quant = use_double_quant
         self.double_quant_scale_dtype = double_quant_scale_dtype
-        self.llm_int8_skip_modules = kwargs.get("llm_int8_skip_modules", 
+        self.llm_int8_skip_modules = kwargs.get("llm_int8_skip_modules",
                                                 ["lm_head", "transformer.output_layer", "embed_out"])
         self.use_ggml = use_ggml
         self.use_neural_speed = use_neural_speed
@@ -1160,7 +1160,7 @@ def __init__(
         self.iters = iters
         self.seq_len = seq_len
         self.quant_lm_head = quant_lm_head
-        self.llm_int8_skip_modules = kwargs.get("llm_int8_skip_modules", 
+        self.llm_int8_skip_modules = kwargs.get("llm_int8_skip_modules",
                                                 ["lm_head", "transformer.output_layer", "embed_out"])
         if self.quant_lm_head:
             self.llm_int8_skip_modules = []