
Load weight-only quantized model with INCModelForCausalLM
Signed-off-by: Cheng, Penghui <[email protected]>
PenghuiCheng committed Mar 24, 2024
1 parent f51266a commit f970272
Showing 4 changed files with 28 additions and 16 deletions.
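
In user code, the effect of this commit is that INCModelForCausalLM.from_pretrained() becomes the single entry point for both regular INC-quantized checkpoints and weight-only quantized ones, replacing the separate ITREXAutoModelForCausalLM class. A minimal loading sketch (the directory and prompt are placeholders; weight-only checkpoints additionally need intel-extension-for-transformers and peft installed):

import torch
from transformers import AutoTokenizer
from optimum.intel.neural_compressor import INCModelForCausalLM

output_dir = "path/to/quantized_model"  # placeholder: directory written by the quantizer

# Single loading path after this commit: from_pretrained() inspects the saved
# quantization config and dispatches to the ITREX weight-only backend when it
# finds one; otherwise it falls back to the regular INC loading logic.
model = INCModelForCausalLM.from_pretrained(output_dir)
tokenizer = AutoTokenizer.from_pretrained(output_dir)

tokens = tokenizer("This is a sample input", return_tensors="pt")
with torch.no_grad():
    outputs = model(**tokens)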
6 changes: 1 addition & 5 deletions examples/neural_compressor/language-modeling/run_clm.py
@@ -63,7 +63,6 @@
 if is_intel_extension_for_transformers_available():
     from intel_extension_for_transformers.transformers.utils.config import WeightOnlyQuantConfig
 
-    from optimum.intel.neural_compressor import ITREXAutoModelForCausalLM
 
 os.environ["CUDA_VISIBLE_DEVICES"] = ""
 
@@ -777,10 +776,7 @@ def compute_metrics(eval_preds):
         trainer.model = quantizer._quantized_model
 
     if optim_args.apply_quantization and optim_args.verify_loading:
-        if optim_args.quantization_approach == "weight_only":
-            loaded_model = ITREXAutoModelForCausalLM.from_pretrained(training_args.output_dir)
-        else:
-            loaded_model = INCModelForCausalLM.from_pretrained(training_args.output_dir)
+        loaded_model = INCModelForCausalLM.from_pretrained(training_args.output_dir)
         tokens = tokenizer("This is a sample input", return_tensors="pt")
         with torch.no_grad():
             original_model_outputs = trainer.model(**tokens)
4 changes: 0 additions & 4 deletions optimum/intel/neural_compressor/__init__.py
@@ -32,7 +32,3 @@
 
 if is_diffusers_available():
     from .modeling_diffusion import INCStableDiffusionPipeline
-
-
-if is_intel_extension_for_transformers_available():
-    from .modeling_base import ITREXAutoModelForCausalLM
31 changes: 26 additions & 5 deletions optimum/intel/neural_compressor/modeling_base.py
@@ -65,11 +65,7 @@
 
 if is_intel_extension_for_transformers_available():
     from intel_extension_for_transformers.transformers.modeling import AutoModelForCausalLM as ITREX_WOQ_MODEL
-
-    class ITREXAutoModelForCausalLM(ITREX_WOQ_MODEL):
-        auto_model_class = AutoModelForCausalLM
-        export_feature = "text-generation"
-
+    from intel_extension_for_transformers.transformers.utils import WeightOnlyQuantConfig
 
 class INCModel(OptimizedModel):
     auto_model_class = AutoModel
@@ -138,6 +134,31 @@ def _from_pretrained(
         model_save_dir = Path(model_cache_path).parent
         inc_config = None
         msg = None
+        try:
+            quantization_config = WeightOnlyQuantConfig.from_pretrained(model_id)
+            if getattr(quantization_config, "algorithm", None) is not None and quantization_config.algorithm.lower() in [
+                "rtn", "gptq", "awq", "autoround"
+            ]:
+                if not is_intel_extension_for_transformers_available():
+                    raise ImportError(
+                        "Could not find the intel-extension-for-transformers package. "
+                        "Please install it with `pip install intel-extension-for-transformers` and `pip install peft`."
+                    )
+                return ITREX_WOQ_MODEL.from_pretrained(
+                    pretrained_model_name_or_path=model_id,
+                    use_auth_token=use_auth_token,
+                    revision=revision,
+                    force_download=force_download,
+                    cache_dir=cache_dir,
+                    local_files_only=local_files_only,
+                    subfolder=subfolder,
+                    trust_remote_code=trust_remote_code,
+                    **kwargs,
+                )
+        except EnvironmentError:
+            msg = (
+                "The model is not quantized with weight-only quantization."
+            )
         try:
             inc_config = INCConfig.from_pretrained(model_id)
             if not is_torch_version("==", inc_config.torch_version):
3 changes: 1 addition & 2 deletions tests/neural_compressor/test_optimization.py
@@ -65,7 +65,6 @@
 from optimum.pipelines import ORT_SUPPORTED_TASKS
 
 if is_intel_extension_for_transformers_available():
-    from optimum.intel.neural_compressor import ITREXAutoModelForCausalLM
     from intel_extension_for_transformers.transformers.utils.config import WeightOnlyQuantConfig
 
 os.environ["CUDA_VISIBLE_DEVICES"] = ""
@@ -244,7 +243,7 @@ def test_weight_only_quantization(self, no_config, algo, weight_dtype):
             weight_only=True,  # RTN quantization with the NF4 weight dtype is the default.
             save_directory=tmp_dir,
         )
-        q_model = ITREXAutoModelForCausalLM.from_pretrained(tmp_dir)
+        q_model = INCModelForCausalLM.from_pretrained(tmp_dir)
         inp = torch.tensor([calibration_dataset[0]["input_ids"]])
         out = model(inp)[0]
         q_out = q_model(inp)[0]
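
For reference, the round trip this test exercises looks roughly like the sketch below, assuming the INCQuantizer API on this branch (the tiny model name is a placeholder; weight_only=True routes quantization through the ITREX backend, with RTN and NF4 weights as the defaults):

import tempfile

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from optimum.intel.neural_compressor import INCModelForCausalLM, INCQuantizer

model_name = "hf-internal-testing/tiny-random-GPTNeoForCausalLM"  # placeholder tiny model
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

with tempfile.TemporaryDirectory() as tmp_dir:
    quantizer = INCQuantizer.from_pretrained(model, task="text-generation")
    # RTN needs no calibration data, so quantize() is called without a dataset here.
    quantizer.quantize(weight_only=True, save_directory=tmp_dir)

    # After this commit, the weight-only checkpoint loads through the same
    # class as any other INC model.
    q_model = INCModelForCausalLM.from_pretrained(tmp_dir)
    inp = tokenizer("This is a sample input", return_tensors="pt").input_ids
    with torch.no_grad():
        out = model(inp)[0]    # float baseline
        q_out = q_model(inp)[0]  # quantized output, expected to stay close to the baseline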
