This repository has been archived by the owner on Oct 25, 2024. It is now read-only.

Commit

Fix SQ baichuan without position_ids for torch and ipex 2.3.0 (#1597)
Signed-off-by: Wang, Chang <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
changwangss and pre-commit-ci[bot] authored Jun 11, 2024
1 parent 3f492c4 commit 14734de
Showing 7 changed files with 63 additions and 91 deletions.
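In short, this commit pins transformers==4.38.1 next to torch==2.3.0+cpu and intel_extension_for_pytorch==2.3.0, removes the per-topology "pip install transformers==..." overrides and the separate baichuan_7b_gen_ipex_static CI entry, selects the IPEX-optimized model list from the installed IPEX version, and drops the llama/qwen special cases so that SmoothQuant (SQ) runs for Baichuan models whose forward() takes no position_ids. For orientation, a minimal SQ run on this stack might look like the sketch below; it assumes the SmoothQuantConfig / AutoModelForCausalLM entry points that run_generation_sq.py drives, and argument names may differ between ITREX releases.

# Hypothetical sketch, not taken from this commit: SmoothQuant a Baichuan
# checkpoint on the stack the commit targets (torch 2.3.0+cpu, ipex 2.3.0,
# transformers 4.38.1).
from transformers import AutoTokenizer
from intel_extension_for_transformers.transformers import (
    AutoModelForCausalLM,
    SmoothQuantConfig,
)

model_name = "baichuan-inc/Baichuan2-7B-Chat"  # example checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

sq_config = SmoothQuantConfig(
    tokenizer=tokenizer,  # used to build the calibration dataloader
    alpha=0.85,           # per-topology alpha, mirroring run_tuning.sh below
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=sq_config,
    trust_remote_code=True,
)

prompt = "Welcome to use Intel Extension for Transformers."
inputs = tokenizer(prompt, return_tensors="pt")
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=32)[0]))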
26 changes: 0 additions & 26 deletions examples/.config/pytorch_optimize.json
@@ -2268,32 +2268,6 @@
}
}
},
"baichuan_7b_gen_ipex_static": {
"working_dir": "huggingface/pytorch/text-generation/quantization",
"tune": {
"cmd": "bash run_tuning.sh",
"params": {
"topology": "baichuan_7b",
"task": "generation",
"approach": "static",
"output_model": "saved_results"
}
},
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "baichuan_7b",
"task": "generation",
"approach": "static",
"backend": "ipex",
"mode": "benchmark",
"batch_size": "112",
"iters": "100",
"int8": "false",
"config": "saved_results"
}
}
},
"baichuan2_7b_gen_ipex_static": {
"working_dir": "huggingface/pytorch/text-generation/quantization",
"tune": {
@@ -5,7 +5,7 @@ protobuf
sentencepiece != 0.1.92
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.3.0+cpu
transformers
transformers==4.38.1
intel_extension_for_pytorch==2.3.0
optimum-intel==1.16.1
bitsandbytes #baichuan
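
The previously unpinned transformers entry is now fixed at 4.38.1, the version recommended by the IPEX 2.3.0 code path further down. An optional sanity check of the installed stack, using only each package's __version__ attribute, could look like this:

# Optional environment check against the pinned SQ requirements (assumed
# versions: torch 2.3.0+cpu, transformers 4.38.1, ipex 2.3.0+cpu).
import torch
import transformers
import intel_extension_for_pytorch as ipex

expected = {"torch": "2.3.0", "transformers": "4.38.1", "ipex": "2.3.0"}
found = {
    "torch": torch.__version__,
    "transformers": transformers.__version__,
    "ipex": ipex.__version__,
}
for name, want in expected.items():
    assert found[name].startswith(want), f"{name}: found {found[name]}, expected {want}*"
print("environment matches the pinned SmoothQuant requirements")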
@@ -119,14 +119,12 @@ function run_benchmark {
elif [ "${topology}" = "llama_7b" ]; then
model_name_or_path="meta-llama/Llama-2-7b-chat-hf"
script="run_generation_sq.py"
pip install transformers==4.35.2
elif [ "${topology}" = "llama2_7b_gptq" ]; then
model_name_or_path="meta-llama/Llama-2-7b-hf"
script="run_generation_cpu_woq.py"
elif [ "${topology}" = "llama_13b" ]; then
model_name_or_path="meta-llama/Llama-2-13b-chat-hf"
script="run_generation_sq.py"
pip install transformers==4.35.2
elif [ "${topology}" = "dolly_v2_3b" ]; then
model_name_or_path="/tf_dataset2/models/pytorch/dolly_v2_3b"
script="run_generation_sq.py"
@@ -137,47 +135,32 @@
model_name_or_path="THUDM/chatglm3-6b"
script="run_generation_sq.py"
extra_cmd=$extra_cmd" --trust_remote_code"
pip install transformers==4.35.2
elif [ "${topology}" = "chatglm2_6b" ]; then
model_name_or_path="THUDM/chatglm2-6b"
script="run_generation_sq.py"
extra_cmd=$extra_cmd" --trust_remote_code"
pip install transformers==4.35.2
elif [ "${topology}" = "chatglm_6b" ]; then
model_name_or_path="THUDM/chatglm-6b"
script="run_generation_sq.py"
extra_cmd=$extra_cmd" --trust_remote_code"
pip install transformers==4.33
elif [ "${topology}" = "falcon_7b" ]; then
model_name_or_path="tiiuae/falcon-7b-instruct"
script="run_generation_sq.py"
pip install transformers==4.33
elif [ "${topology}" = "baichuan_7b" ]; then
model_name_or_path="baichuan-inc/Baichuan-7B"
extra_cmd=$extra_cmd" --trust_remote_code"
pip install transformers==4.33
script="run_generation_sq.py"
elif [ "${topology}" = "baichuan_13b" ]; then
model_name_or_path="baichuan-inc/Baichuan-13B-Base"
model_name_or_path="baichuan-inc/Baichuan-13B-Chat"
extra_cmd=$extra_cmd" --trust_remote_code"
extra_cmd=$extra_cmd" --_commit_hash 14d5b0e204542744900f6fb52422c6d633bdcb00"
pip install transformers==4.33
script="run_generation_sq.py"
elif [ "${topology}" = "baichuan2_7b" ]; then
model_name_or_path="baichuan-inc/Baichuan2-7B-Base"
model_name_or_path="baichuan-inc/Baichuan2-7B-Chat"
extra_cmd=$extra_cmd" --trust_remote_code"
pip install transformers==4.33
script="run_generation_sq.py"
elif [ "${topology}" = "baichuan2_13b" ]; then
model_name_or_path="baichuan-inc/Baichuan2-13B-Base"
model_name_or_path="baichuan-inc/Baichuan2-13B-Chat"
extra_cmd=$extra_cmd" --trust_remote_code"
pip install transformers==4.35.2
script="run_generation_sq.py"
elif [ "${topology}" = "qwen_7b" ]; then
model_name_or_path="Qwen/Qwen-7B"
model_name_or_path="Qwen/Qwen-7B-Chat"
extra_cmd=$extra_cmd" --trust_remote_code"
extra_cmd=$extra_cmd" --_commit_hash f7bc352f27bb1c02ee371a4576942a7d96c8bb97"
pip install transformers==4.35.2
script="run_generation_sq.py"
elif [ "${topology}" = "mistral_7b" ]; then
model_name_or_path="Intel/neural-chat-7b-v3"
@@ -133,15 +133,13 @@ function run_tuning {
model_name_or_path="/tf_dataset2/models/nlp_toolkit/llama-2-7b-chat/Llama-2-7b-chat-hf"
extra_cmd=$extra_cmd" --sq --alpha ${alpha}"
extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
pip install transformers==4.35.2
script="run_generation_sq.py"
elif [ "${topology}" = "llama_13b" ]; then
alpha=0.8
model_name_or_path="meta-llama/Llama-2-13b-chat-hf"
extra_cmd=$extra_cmd" --sq --alpha ${alpha}"
extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
script="run_generation_sq.py"
pip install transformers==4.35.2
elif [ "${topology}" = "dolly_v2_3b" ]; then
alpha=0.6
model_name_or_path="/tf_dataset2/models/pytorch/dolly_v2_3b"
@@ -161,72 +159,54 @@ function run_tuning {
extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
extra_cmd=$extra_cmd" --trust_remote_code"
script="run_generation_sq.py"
pip install transformers==4.35.2
elif [ "${topology}" = "chatglm2_6b" ]; then
alpha=0.75
model_name_or_path="THUDM/chatglm2-6b"
extra_cmd=$extra_cmd" --sq --alpha ${alpha}"
extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
extra_cmd=$extra_cmd" --trust_remote_code"
script="run_generation_sq.py"
pip install transformers==4.35.2
elif [ "${topology}" = "chatglm_6b" ]; then
alpha=0.75
model_name_or_path="THUDM/chatglm-6b"
extra_cmd=$extra_cmd" --sq --alpha ${alpha}"
extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
extra_cmd=$extra_cmd" --trust_remote_code"
pip install transformers==4.33
script="run_generation_sq.py"
elif [ "${topology}" = "falcon_7b" ]; then
alpha=0.7
model_name_or_path="tiiuae/falcon-7b-instruct"
extra_cmd=$extra_cmd" --sq --alpha ${alpha}"
extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
pip install transformers==4.33.3
script="run_generation_sq.py"
elif [ "${topology}" = "baichuan_7b" ]; then
alpha=0.85
model_name_or_path="baichuan-inc/Baichuan-7B"
extra_cmd=$extra_cmd" --sq --alpha ${alpha}"
extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
extra_cmd=$extra_cmd" --trust_remote_code"
script="run_generation_sq.py"
pip install transformers==4.33
elif [ "${topology}" = "baichuan_13b" ]; then
alpha=0.85
model_name_or_path="baichuan-inc/Baichuan-13B-Base"
model_name_or_path="baichuan-inc/Baichuan-13B-Chat"
extra_cmd=$extra_cmd" --sq --alpha ${alpha}"
extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
extra_cmd=$extra_cmd" --trust_remote_code"
extra_cmd=$extra_cmd" --_commit_hash 14d5b0e204542744900f6fb52422c6d633bdcb00"
pip install transformers==4.33
script="run_generation_sq.py"
elif [ "${topology}" = "baichuan2_7b" ]; then
alpha=0.85
model_name_or_path="baichuan-inc/Baichuan2-7B-Base"
model_name_or_path="baichuan-inc/Baichuan2-7B-Chat"
extra_cmd=$extra_cmd" --sq --alpha ${alpha}"
extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
extra_cmd=$extra_cmd" --trust_remote_code"
pip install transformers==4.33
script="run_generation_sq.py"
elif [ "${topology}" = "baichuan2_13b" ]; then
alpha=0.55
model_name_or_path="baichuan-inc/Baichuan2-13B-Base"
model_name_or_path="baichuan-inc/Baichuan2-13B-Chat"
extra_cmd=$extra_cmd" --sq --alpha ${alpha}"
extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
extra_cmd=$extra_cmd" --trust_remote_code"
pip install transformers==4.35.2
script="run_generation_sq.py"
elif [ "${topology}" = "qwen_7b" ]; then
alpha=0.9
model_name_or_path="Qwen/Qwen-7B"
model_name_or_path="Qwen/Qwen-7B-Chat"
extra_cmd=$extra_cmd" --sq --alpha ${alpha}"
extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
extra_cmd=$extra_cmd" --trust_remote_code"
extra_cmd=$extra_cmd" --_commit_hash f7bc352f27bb1c02ee371a4576942a7d96c8bb97"
pip install transformers==4.35.2
script="run_generation_sq.py"
elif [ "${topology}" = "mistral_7b" ]; then
alpha=0.8
model_name_or_path="Intel/neural-chat-7b-v3"
Expand All @@ -240,15 +220,13 @@ function run_tuning {
extra_cmd=$extra_cmd" --sq --alpha ${alpha}"
extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
extra_cmd=$extra_cmd" --trust_remote_code"
pip install transformers==4.36.1
script="run_generation_sq.py"
elif [ "${topology}" = "phi_1_5b" ]; then
alpha=0.5
model_name_or_path="susnato/phi-1_5_dev"
extra_cmd=$extra_cmd" --sq --alpha ${alpha}"
extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
extra_cmd=$extra_cmd" --trust_remote_code"
pip install transformers==4.36.1
script="run_generation_sq.py"
elif [ "${topology}" = "llama2_7b_gptq" ]; then
model_name_or_path="meta-llama/Llama-2-7b-hf"
@@ -166,14 +166,9 @@ def forward(
input_bs, input_len = input_ids.shape
if self.use_cache and past_key_values is None:
if model_type in IPEX_OPT_LLM_SUPPORTED:
if model_type == "llama" and transformers.__version__ >= "4.36":
past_key_values = generate_dummy_past_key_values_for_inference(
config=self.config, input_bs=input_bs
)
else:
past_key_values = generate_dummy_past_key_values_for_opt_llm(
config=self.config, input_bs=input_bs, num_beams=1
)
past_key_values = generate_dummy_past_key_values_for_opt_llm(
config=self.config, input_bs=input_bs, num_beams=1
)
else:
past_key_values = generate_dummy_past_key_values_for_inference(
config=self.config, input_bs=input_bs
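With this hunk, every model family in IPEX_OPT_LLM_SUPPORTED gets its dummy cache from generate_dummy_past_key_values_for_opt_llm, and the llama-only branch keyed on transformers.__version__ >= "4.36" disappears. Models outside that list still receive a plain Hugging Face style cache; the helper below is an illustrative simplification of that fallback, not the exact generate_dummy_past_key_values_for_inference from utility.py.

# Illustrative only: a plain HF-style dummy KV cache, one (key, value) pair per
# layer with zero past length, so a traced or quantized graph always sees a
# past_key_values input. The real helper also handles per-architecture head
# counts and multi-query layouts.
import torch

def dummy_past_key_values(config, input_bs):
    num_layers = config.num_hidden_layers
    num_kv_heads = getattr(config, "num_key_value_heads", config.num_attention_heads)
    head_dim = config.hidden_size // config.num_attention_heads
    shape = (input_bs, num_kv_heads, 0, head_dim)  # zero-length past sequence
    return tuple(
        (torch.zeros(shape), torch.zeros(shape)) for _ in range(num_layers)
    )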
@@ -841,8 +841,7 @@ def forward(self, input: torch.Tensor) -> tuple[torch.Tensor, None]:
model = model.float()
model.eval()
model_type = model.config.model_type.replace("_", "-")
if "llama" in model_type and transformers.__version__ >= "4.36.0":
quantization_config.ipex_opt_llm = False

logger.info("Applying SmoothQuant.")
# ipex.optimize_transformers
if quantization_config.ipex_opt_llm is None:
@@ -851,7 +850,7 @@
logger.info(
"quantization_config.ipex_opt_llm set to True and ipex.optimize_transformers is used."
)
logger.warning("The suggested transformers version is 4.35.2.")
logger.warning("The suggested transformers version is 4.38.1.")
else:
quantization_config.ipex_opt_llm = False
if quantization_config.ipex_opt_llm:
@@ -946,7 +945,7 @@ def collate_batch(batch):
)

last_ind.append(input_ids.shape[0] - 1)
if model_type in ["bloom", "qwen"]:
if model_type in ["bloom"]:
attention_mask = torch.ones(len(input_ids) + 1)
attention_mask[0] = 0
else:
51 changes: 47 additions & 4 deletions intel_extension_for_transformers/transformers/utils/utility.py
@@ -21,6 +21,7 @@
from typing import Optional, Tuple
from neural_compressor.utils import logger
from neural_compressor.utils.utility import LazyImport, CpuInfo
from intel_extension_for_transformers.tools.utils import is_ipex_available


CONFIG_NAME = "best_configure.yaml"
@@ -36,6 +37,8 @@
SAFE_WEIGHTS_NAME = "model.safetensors"
SAFE_WEIGHTS_INDEX_NAME = "model.safetensors.index.json"

if is_ipex_available():
import intel_extension_for_pytorch as ipex
torch = LazyImport("torch")

def str2bool(v):
@@ -300,8 +303,24 @@ def generate_dummy_past_key_values_for_opt_llm(config, input_bs, num_beams=1):
]
return tuple(past_key_values)


IPEX_OPT_LLM_SUPPORTED = {"gptj", "opt", "llama", "falcon", "chatglm", "baichuan"}
IPEX_OPT_LLM_SUPPORTED_DICT = {
"2.2": ["gptj", "opt", "llama", "falcon", "chatglm", "baichuan", "gpt-neox"],
"2.3": [
"gptj",
"opt",
"llama",
"falcon",
"chatglm",
"baichuan",
"qwen",
"bloom",
"codegen",
"gptbigcode",
"t5",
"mixtral",
"mpt",
],
}

MODEL_TYPES_REQUIRING_POSITION_IDS = {
"codegen",
@@ -314,9 +333,32 @@ def generate_dummy_past_key_values_for_opt_llm(config, input_bs, num_beams=1):
"llama",
"mistral",
"chatglm",
"baichuan"
}

if is_ipex_available() and ipex.__version__ == "2.2.0+cpu":
logger.info(
"ipex.llm.optimize by 2.2.0 version supported model family: {}".format(
",".join(IPEX_OPT_LLM_SUPPORTED_DICT["2.2"])
)
)
logger.info(
"The recommended transformers version is 4.35.2 if you used IPEX 2.2.0 version."
)
IPEX_OPT_LLM_SUPPORTED = IPEX_OPT_LLM_SUPPORTED_DICT["2.2"]
elif is_ipex_available() and ipex.__version__ == "2.3.0+cpu":
logger.info(
"ipex.llm.optimize by 2.3.0 version supported model family: {}".format(
", ".join(IPEX_OPT_LLM_SUPPORTED_DICT["2.3"])
)
)
logger.info(
"The recommended transformers version is 4.38.1 if you used IPEX 2.3.0 version."
)
IPEX_OPT_LLM_SUPPORTED = IPEX_OPT_LLM_SUPPORTED_DICT["2.3"]
else:
logger.warning("Please check the intel_extension_for_pytorch version is 2.3.0+cpu.")
IPEX_OPT_LLM_SUPPORTED = IPEX_OPT_LLM_SUPPORTED_DICT["2.3"]

def get_example_inputs(model_config, batch_size=1, tokenizer=None, num_beams=4):
"""Generate the dummy example inputs."""
prompt = "Welcome to use Intel Extension for Transformers."
@@ -420,7 +462,8 @@ def recover_model_from_json(fp32_model_name_or_path, json_file_path, trust_remot
(object): quantized model
"""
from transformers import AutoModelForCausalLM
user_model = AutoModelForCausalLM.from_pretrained(fp32_model_name_or_path, trust_remote_code=trust_remote_code)
user_model = AutoModelForCausalLM.from_pretrained(fp32_model_name_or_path,
trust_remote_code=trust_remote_code).float()
if user_model.config.model_type in IPEX_OPT_LLM_SUPPORTED:
import intel_extension_for_pytorch as ipex
qconfig = ipex.quantization.default_static_qconfig_mapping
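The version gate above chooses the IPEX-optimized model list at import time (one list for ipex 2.2.0+cpu, another for 2.3.0+cpu), while MODEL_TYPES_REQUIRING_POSITION_IDS controls whether the example inputs built for tracing and calibration carry position_ids, which is the knob the commit title refers to for Baichuan. A hedged sketch of how these module-level gates are typically consumed when assembling trace inputs follows; build_example_inputs is a hypothetical helper, not the get_example_inputs body above.

# Hypothetical consumer of the gates defined in utility.py: include
# position_ids only for model types that declare they need them, so families
# whose forward() does not accept position_ids are traced without one.
import torch
from intel_extension_for_transformers.transformers.utils.utility import (
    IPEX_OPT_LLM_SUPPORTED,
    MODEL_TYPES_REQUIRING_POSITION_IDS,
    generate_dummy_past_key_values_for_opt_llm,
)

def build_example_inputs(model_config, model_type, input_bs=1, seq_len=32):
    input_ids = torch.ones((input_bs, seq_len), dtype=torch.long)
    attention_mask = torch.ones_like(input_ids)
    example = {"input_ids": input_ids, "attention_mask": attention_mask}
    if model_type in MODEL_TYPES_REQUIRING_POSITION_IDS:
        example["position_ids"] = torch.arange(seq_len).unsqueeze(0).repeat(input_bs, 1)
    if model_type in IPEX_OPT_LLM_SUPPORTED:
        example["past_key_values"] = generate_dummy_past_key_values_for_opt_llm(
            config=model_config, input_bs=input_bs, num_beams=1
        )
    return example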
