This repository has been archived by the owner on Oct 25, 2024. It is now read-only.

Commit

fix embed_out
Signed-off-by: changwangss <[email protected]>
changwangss committed Aug 5, 2024
1 parent 0bc8428 commit a08344a
Showing 2 changed files with 5 additions and 3 deletions.
@@ -161,7 +161,9 @@ def build_woq_model(model, quantization_config):
     from neural_compressor.adaptor.torch_utils.util import set_module
     weight_dtype = quantization_config.weight_dtype
     for n, m in model.named_modules():
+        print(n)
         if n in quantization_config.llm_int8_skip_modules:
+            # import pdb;pdb.set_trace();
             continue
         if isinstance(m, torch.nn.Linear):
             zp = getattr(
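For context, the skip check in build_woq_model above is an exact string match against the names yielded by torch.nn.Module.named_modules(), which are fully qualified dotted paths relative to the model object passed in. A minimal sketch of that matching behavior (toy module names, not from the repository):

import torch

# Toy model with a nested body and a top-level head (hypothetical layout).
model = torch.nn.Module()
model.body = torch.nn.Linear(4, 4)
model.head = torch.nn.Linear(4, 2)

skip = ["head"]  # an entry must equal a named_modules() name exactly
for name, module in model.named_modules():
    if name in skip:
        continue  # left unquantized, mirroring the skip branch above
    if isinstance(module, torch.nn.Linear):
        print("would quantize:", name)  # prints: would quantize: body

An entry that does not exactly match one of those dotted names is silently ignored, which is what the config change below addresses.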
6 changes: 3 additions & 3 deletions intel_extension_for_transformers/transformers/utils/config.py
@@ -832,9 +832,9 @@ def __init__(
         self.double_quant_use_sym = double_quant_use_sym
         self.double_quant_group_size = double_quant_group_size
         # "transformer.output_layer" for chatglm series model.
-        # "gpt_neox.embed_out" for dolly v2 series model.
-        self.llm_int8_skip_modules = kwargs.get("llm_int8_skip_modules",
-                                                ["lm_head", "transformer.output_layer", "gpt_neox.embed_out"])
+        # "embed_out" for dolly v2 series model.
+        self.llm_int8_skip_modules = kwargs.get("llm_int8_skip_modules",
+                                                ["lm_head", "transformer.output_layer", "embed_out"])
         self.use_ggml = use_ggml
         self.use_quant = use_quant
         self.use_neural_speed = use_neural_speed
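For reference, the likely reason for the rename above: GPTNeoXForCausalLM, the architecture behind the dolly v2 checkpoints, registers its output projection as a top-level attribute called embed_out, so named_modules() reports the name "embed_out" rather than "gpt_neox.embed_out". A small stand-in class (not the real Hugging Face model, illustrative only) showing which entry matches, assuming the default skip list from the diff:

import torch

class FakeGPTNeoXForCausalLM(torch.nn.Module):
    """Stand-in mimicking the dolly v2 attribute layout (illustrative only)."""
    def __init__(self):
        super().__init__()
        self.gpt_neox = torch.nn.Module()             # transformer body
        self.gpt_neox.embed_in = torch.nn.Embedding(10, 8)
        self.embed_out = torch.nn.Linear(8, 10)       # LM head lives at the top level

skip = ["lm_head", "transformer.output_layer", "embed_out"]
names = [n for n, _ in FakeGPTNeoXForCausalLM().named_modules()]
print("embed_out" in names)           # True  -> the head is now skipped
print("gpt_neox.embed_out" in names)  # False -> the old entry never matched

This is consistent with the commit title: with the old entry, the dolly v2 output head would not have been skipped during weight-only quantization.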
