Fixed UT and examples error
Signed-off-by: Cheng, Penghui <[email protected]>
PenghuiCheng committed Mar 17, 2024
1 parent 3ca3f60 commit 0cc7c00
Showing 10 changed files with 117 additions and 44 deletions.
5 changes: 4 additions & 1 deletion .github/workflows/test_inc.yml
@@ -30,8 +30,11 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install "cmake>=3.16"
pip install py-cpuinfo
pip install torch==2.1.0+cpu --extra-index-url https://download.pytorch.org/whl/cpu
pip install .[neural-compressor,diffusers,tests]
pip install intel-extension-for-pytorch
pip install intel-extension-for-pytorch==2.1.100
- name: Test with Pytest
run: |
pytest tests/neural_compressor/
60 changes: 48 additions & 12 deletions examples/neural_compressor/language-modeling/run_clm.py
@@ -63,6 +63,8 @@
if is_intel_extension_for_transformers_available():
from intel_extension_for_transformers.transformers.utils.config import WeightOnlyQuantConfig

from optimum.intel.neural_compressor import ITREXAutoModelForCausalLM

os.environ["CUDA_VISIBLE_DEVICES"] = ""

# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
@@ -147,7 +149,9 @@ class OptimizationArguments:
)
quantization_approach: str = field(
default="dynamic",
metadata={"help": "Quantization approach. Supported approach are static, dynamic and aware_training."},
metadata={
"help": "Quantization approach. Supported approach are static, dynamic aware_training and weight_only."
},
)
smooth_quant: bool = field(
default=False,
@@ -200,8 +204,12 @@ class OptimizationArguments:
default=False,
metadata={"help": "Whether or not to verify the loading of the quantized model."},
)
bits: str = field(
default="4",
metadata={"help": "Bits number of weight for weight only quantization. 1~8 bits."},
)
weight_dtype: str = field(
default="int8",
default="int4_clip",
metadata={"help": "weight dtype for weight only quantization."},
)
group_size: int = field(
@@ -218,9 +226,24 @@
)
quantization_methodology: str = field(
default="RTN",
metadata={
"help": "Quantization methodology for weight only quantization. Choose from 'RTN', 'AWQ' and 'GPTQ'."
},
metadata={"help": "Quantization methodology for weight only quantization. Choose from 'RTN' and 'GPTQ'."},
)
gptq_percdamp: float = field(
default=0.01,
metadata={"help": "Percent of the average Hessian diagonal to use for dampening."},
)
gptq_block_size: int = field(
default=128,
metadata={"help": "Block size. sub weight matrix size to run GPTQ."},
)
gptq_nsamples: int = field(default=128, metadata={"help": "Number of calibration data samples."})
gptq_use_max_length: bool = field(
default=False,
metadata={"help": "Set all sequence length to be same length of args.gptq_pad_max_length"},
)
gptq_pad_max_length: int = field(
default=2048,
metadata={"help": "Calibration dataset sequence max length, this should align with your model config"},
)


@@ -636,11 +659,21 @@ def compute_metrics(eval_preds):
)
if optim_args.apply_pruning or optim_args.apply_distillation:
raise ValueError("Weight only quantization and pruning or distillation cannot be combined.")
if optim_args.quantization_methodology == "GPTQ":
algorithm_args = {
"act_order": False,
"percdamp": optim_args.gptq_percdamp,
"block_size": optim_args.gptq_block_size,
"nsamples": optim_args.gptq_nsamples,
"use_max_length": optim_args.gptq_use_max_length,
"pad_max_length": optim_args.gptq_pad_max_length,
}
quantization_config = WeightOnlyQuantConfig(
weight_dtype=optim_args.weight_dtype,
group_size=optim_args.group_size,
scheme=optim_args.weight_only_scheme,
algorithm=optim_args.quantization_methodology,
algorithm_args=algorithm_args if optim_args.quantization_methodology == "GPTQ" else None,
)
else:
quantization_config = PostTrainingQuantConfig(
@@ -733,17 +766,20 @@ def compute_metrics(eval_preds):
quantizer.quantize(
quantization_config=quantization_config,
save_directory=training_args.output_dir,
calibration_dataset=train_dataset
if optim_args.quantization_approach in ["static", "weight_only"]
else None,
batch_size=1
if optim_args.quantization_approach == "weight_only"
else training_args.per_device_train_batch_size,
calibration_dataset=(
train_dataset if optim_args.quantization_approach in ["static", "weight_only"] else None
),
batch_size=(
1 if optim_args.quantization_approach == "weight_only" else training_args.per_device_train_batch_size
),
)
trainer.model = quantizer._quantized_model

if optim_args.apply_quantization and optim_args.verify_loading:
loaded_model = INCModelForCausalLM.from_pretrained(training_args.output_dir)
if optim_args.quantization_approach == "weight_only":
loaded_model = ITREXAutoModelForCausalLM.from_pretrained(training_args.output_dir)
else:
loaded_model = INCModelForCausalLM.from_pretrained(training_args.output_dir)
tokens = tokenizer("This is a sample input", return_tensors="pt")
with torch.no_grad():
original_model_outputs = trainer.model(**tokens)
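
For reference, the weight-only GPTQ path exposed by these new arguments can also be driven directly through the optimum-intel API. The sketch below mirrors the configuration assembled above; the checkpoint, calibration texts, and output directory are illustrative placeholders rather than part of the example script.

# Hedged sketch of the GPTQ weight-only flow used by run_clm.py (placeholder
# checkpoint and calibration data; config values mirror the new defaults above).
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from intel_extension_for_transformers.transformers.utils.config import WeightOnlyQuantConfig
from optimum.intel import INCQuantizer
from optimum.intel.neural_compressor import ITREXAutoModelForCausalLM

model_name = "hf-internal-testing/tiny-random-GPTNeoForCausalLM"  # placeholder model
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # GPT-Neo ships without a pad token

# A tiny tokenized dataset stands in for the script's real training split.
texts = ["Weight-only quantization keeps activations in full precision."] * 8
calibration_dataset = Dataset.from_dict(
    dict(tokenizer(texts, padding="max_length", max_length=64, truncation=True))
)

quantization_config = WeightOnlyQuantConfig(
    weight_dtype="int4_clip",  # new default of --weight_dtype
    algorithm="GPTQ",          # --quantization_methodology GPTQ
    algorithm_args={           # assembled exactly as in the script above
        "act_order": False,
        "percdamp": 0.01,
        "block_size": 128,
        "nsamples": 128,
        "use_max_length": False,
        "pad_max_length": 2048,
    },
)

quantizer = INCQuantizer.from_pretrained(model, task="text-generation")
quantizer.quantize(
    quantization_config=quantization_config,
    calibration_dataset=calibration_dataset,
    save_directory="clm_woq_gptq",  # placeholder output directory
    batch_size=1,
)
loaded_model = ITREXAutoModelForCausalLM.from_pretrained("clm_woq_gptq")
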
6 changes: 3 additions & 3 deletions examples/neural_compressor/text-generation/run_generation.py
@@ -368,9 +368,9 @@ def calibration_fn(p_model):

args.length = adjust_length_to_model(
args.length,
max_sequence_length=model.config.max_position_embeddings
if hasattr(model.config, "max_position_embeddings")
else 0,
max_sequence_length=(
model.config.max_position_embeddings if hasattr(model.config, "max_position_embeddings") else 0
),
)
logger.info(args)

6 changes: 5 additions & 1 deletion optimum/intel/neural_compressor/__init__.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from ..utils.import_utils import is_diffusers_available
from ..utils.import_utils import is_diffusers_available, is_intel_extension_for_transformers_available
from .configuration import INCConfig
from .modeling_base import (
INCModel,
@@ -32,3 +32,7 @@

if is_diffusers_available():
from .modeling_diffusion import INCStableDiffusionPipeline


if is_intel_extension_for_transformers_available():
from .modeling_base import ITREXAutoModelForCausalLM
4 changes: 2 additions & 2 deletions optimum/intel/neural_compressor/configuration.py
@@ -35,7 +35,7 @@ class INCConfig(BaseConfig):

def __init__(
self,
quantization: Optional[Union[Dict, _BaseQuantizationConfig, "WeightOnlyQuantConfig"]] = None,
quantization: Optional[Union[Dict, _BaseQuantizationConfig]] = None,
pruning: Optional[Union[Dict, _BaseQuantizationConfig]] = None,
distillation: Optional[Union[Dict, _BaseQuantizationConfig]] = None,
save_onnx_model: bool = False,
@@ -50,7 +50,7 @@ def __init__(
self.save_onnx_model = save_onnx_model

@staticmethod
def _create_quantization_config(config):
def _create_quantization_config(config: Union[Dict, _BaseQuantizationConfig]):
# TODO : add activations_dtype and weights_dtype
if isinstance(config, _BaseQuantizationConfig):
approach = _quantization_model[config.approach]
10 changes: 9 additions & 1 deletion optimum/intel/neural_compressor/modeling_base.py
@@ -43,7 +43,7 @@
from optimum.intel.generation import BaseModelForCausalLM

from ...modeling_base import OptimizedModel
from ..utils.import_utils import _torch_version, is_torch_version
from ..utils.import_utils import _torch_version, is_intel_extension_for_transformers_available, is_torch_version
from .configuration import INCConfig
from .utils import WEIGHTS_NAME

@@ -63,6 +63,14 @@
"""


if is_intel_extension_for_transformers_available():
from intel_extension_for_transformers.transformers.modeling import AutoModelForCausalLM as ITREX_WOQ_MODEL

class ITREXAutoModelForCausalLM(ITREX_WOQ_MODEL):
auto_model_class = AutoModelForCausalLM
export_feature = "text-generation"


class INCModel(OptimizedModel):
auto_model_class = AutoModel
export_feature = "feature-extraction"
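
The new wrapper only rebinds auto_model_class and export_feature on top of the ITREX causal-LM class, so loading a weight-only checkpoint saved by INCQuantizer and running a forward pass looks roughly like the snippet below (the directory name is a placeholder, and it is assumed the tokenizer was saved alongside the quantized weights).

# Usage sketch for ITREXAutoModelForCausalLM, mirroring the verify-loading step
# of the updated run_clm.py example (placeholder paths).
import torch
from transformers import AutoTokenizer
from optimum.intel.neural_compressor import ITREXAutoModelForCausalLM

save_directory = "clm_woq_gptq"  # a directory produced by INCQuantizer.quantize
tokenizer = AutoTokenizer.from_pretrained(save_directory)  # assumes the tokenizer was saved there
model = ITREXAutoModelForCausalLM.from_pretrained(save_directory)

tokens = tokenizer("This is a sample input", return_tensors="pt")
with torch.no_grad():
    outputs = model(**tokens)
print(outputs[0].shape)  # logits of the weight-only quantized model
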
30 changes: 26 additions & 4 deletions optimum/intel/neural_compressor/quantization.py
@@ -15,6 +15,7 @@
import copy
import inspect
import logging
import types
import warnings
from enum import Enum
from itertools import chain
@@ -79,6 +80,7 @@

if is_intel_extension_for_transformers_available():
from intel_extension_for_transformers.llm.quantization.utils import convert_to_quantized_model
from intel_extension_for_transformers.transformers.modeling.modeling_auto import save_low_bit
from intel_extension_for_transformers.transformers.utils.config import WeightOnlyQuantConfig

Config = Union[PostTrainingQuantConfig, WeightOnlyQuantConfig]
@@ -185,6 +187,9 @@ def quantize(
save_directory = Path(save_directory)
save_directory.mkdir(parents=True, exist_ok=True)
save_onnx_model = kwargs.pop("save_onnx_model", False)
device = kwargs.pop("device", "cpu")
use_cpu = True if device == torch.device("cpu") or device == "cpu" else False
use_xpu = True if (isinstance(device, torch.device) and device.type == "xpu") or device == "xpu" else False

if save_onnx_model and (isinstance(self._original_model, ORTModel) or weight_only):
save_onnx_model = False
@@ -217,7 +222,10 @@
f"For weight-only quantization, `quantization_config` should be an instance of `WeightOnlyQuantConfig`, but got: {type(quantization_config)} instead."
)

if calibration_dataset is None and ("GPTQ" in algo or "AWQ" in algo):
if algo not in ["RTN", "GPTQ"]:
raise ValueError("Weight-only quantization is only support RTN and GPTQ algorithm now!")

if calibration_dataset is None and quantization_config.tokenizer is None and ("GPTQ" in algo):
raise ValueError(
"Weight-only quantization needs a calibration dataset for both GPTQ and AWQ methodologies."
)
@@ -278,10 +286,24 @@
)

if not isinstance(quantization_config, PostTrainingQuantConfig):
self._quantized_model = convert_to_quantized_model(self._original_model, quantization_config)
if use_cpu:
# will remove after intel-extension-for-transformers 1.3.3 released
quantization_config.device = "cpu"
quantization_config.post_init()
elif use_xpu:
# will remove after intel-extension-for-transformers 1.3.3 released
quantization_config.device = "xpu"
quantization_config.post_init_xpu()
self._quantized_model = convert_to_quantized_model(
self._original_model, quantization_config, device=quantization_config.device
)
# will remove after intel-extension-for-transformers 1.3.3 released
if hasattr(quantization_config, "calib_dataloader"):
quantization_config.calib_dataloader = None
self._quantized_model.quantization_config = quantization_config
self._quantized_model.save_pretrained = types.MethodType(save_low_bit, self._quantized_model)
# Save the quantized model
output_path = save_directory.joinpath(file_name or default_name)
self._quantized_model.save_pretrained(output_path)
self._quantized_model.save_pretrained(save_directory)
else:
if isinstance(self._original_model.config, PretrainedConfig):
self._original_model.config.backend = quantization_config.backend
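
The device handling added to INCQuantizer.quantize boils down to normalizing a device argument, either a string or a torch.device, into two flags before patching the ITREX config. A standalone sketch of that logic, with an illustrative helper name:

# Standalone sketch of the cpu/xpu normalization above; the helper name is
# illustrative and not part of the library.
import torch

def normalize_device(device="cpu"):
    use_cpu = device == torch.device("cpu") or device == "cpu"
    use_xpu = (isinstance(device, torch.device) and device.type == "xpu") or device == "xpu"
    return use_cpu, use_xpu

assert normalize_device("cpu") == (True, False)
assert normalize_device("xpu") == (False, True)
assert normalize_device(torch.device("xpu")) == (False, True)
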
2 changes: 1 addition & 1 deletion setup.py
@@ -34,7 +34,7 @@
"rjieba",
"timm",
"invisible-watermark>=0.2.0",
"cmake>=3.16",
# Will remove after intel-extension-for-transformers 1.3.3 released.
"intel-extension-for-transformers>=1.3",
"peft",
"auto-gptq",
37 changes: 18 additions & 19 deletions tests/neural_compressor/test_optimization.py
@@ -45,7 +45,7 @@
set_seed,
)
from utils_tests import SEED, INCTestMixin, _generate_dataset
from optimum.intel.utils.import_utils import is_torch_version
from optimum.intel.utils.import_utils import is_torch_version, is_intel_extension_for_transformers_available


from optimum.intel import (
@@ -60,11 +60,13 @@
INCSeq2SeqTrainer,
INCStableDiffusionPipeline,
)
from intel_extension_for_transformers.transformers.utils.config import WeightOnlyQuantConfig
from optimum.intel.utils.constant import DIFFUSION_WEIGHTS_NAME
from optimum.onnxruntime import ORTModelForCausalLM, ORTModelForSequenceClassification
from optimum.pipelines import ORT_SUPPORTED_TASKS

if is_intel_extension_for_transformers_available():
from optimum.intel.neural_compressor import ITREXAutoModelForCausalLM
from intel_extension_for_transformers.transformers.utils.config import WeightOnlyQuantConfig

os.environ["CUDA_VISIBLE_DEVICES"] = ""
set_seed(SEED)
@@ -200,63 +202,60 @@ def test_ipex_static_quantization_with_smoothquant(self, task, model_name, expec
load_ipex_model=True,
)

@unittest.skipIf(
not is_intel_extension_for_transformers_available(), reason="Intel-extension-for-transformers not available!"
)
def test_weight_only_quantization(self):
model_name = "hf-internal-testing/tiny-random-GPTNeoForCausalLM"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.add_special_tokens({"pad_token": "[PAD]"})
calibration_dataset = _generate_dataset(quantizer, tokenizer, num_samples=2)

with tempfile.TemporaryDirectory() as tmp_dir:
quantizer = INCQuantizer.from_pretrained(copy.deepcopy(model), task="text-generation")
calibration_dataset = _generate_dataset(quantizer, tokenizer, num_samples=2)
quantization_config = WeightOnlyQuantConfig(weight_dtype="int8")
q_model = quantizer.quantize(
quantization_config=quantization_config,
save_directory=tmp_dir,
)
q_model = ITREXAutoModelForCausalLM.from_pretrained(tmp_dir)
inp = torch.tensor([calibration_dataset[0]["input_ids"]])
out = model(inp)[0]
q_out = q_model(inp)[0]
self.assertTrue(torch.all(torch.isclose(out, q_out, atol=5e-1)))

with tempfile.TemporaryDirectory() as tmp_dir:
quantizer = INCQuantizer.from_pretrained(copy.deepcopy(model), task="text-generation")
calibration_dataset = _generate_dataset(quantizer, tokenizer, num_samples=2)
quantization_config = WeightOnlyQuantConfig(
algorithm="GPTQ",
algorithm_args={
"percdamp": 0.01,
"act_order": False,
"scheme": "sym",
},
weight_dtype="int4_clip",
)
q_model = quantizer.quantize(
quantization_config=quantization_config,
calibration_dataset=calibration_dataset,
save_directory=tmp_dir,
)
q_model = ITREXAutoModelForCausalLM.from_pretrained(tmp_dir)
inp = torch.tensor([calibration_dataset[0]["input_ids"]])
out = model(inp)[0]
q_out = q_model(inp)[0]
self.assertTrue(torch.all(torch.isclose(out, q_out, atol=5e-1)))

with tempfile.TemporaryDirectory() as tmp_dir:
quantizer = INCQuantizer.from_pretrained(copy.deepcopy(model), task="text-generation")
quantization_config = WeightOnlyQuantConfig(
algorithm="AWQ",
weight_dtype="int4_clip",
)
q_model = quantizer.quantize(
quantization_config=quantization_config,
calibration_dataset=calibration_dataset,
save_directory=tmp_dir,
)
inp = torch.tensor([calibration_dataset[0]["input_ids"]])
out = model(inp)[0]
q_out = q_model(inp)[0]
self.assertTrue(torch.all(torch.isclose(out, q_out, atol=5e-1)))

with tempfile.TemporaryDirectory() as tmp_dir:
quantizer = INCQuantizer.from_pretrained(copy.deepcopy(model), task="text-generation")
calibration_dataset = _generate_dataset(quantizer, tokenizer, num_samples=2)
q_model = quantizer.quantize(
weight_only=True,  # use the RTN quantization method; the NF4 weight dtype is the default.
save_directory=tmp_dir,
)
q_model = ITREXAutoModelForCausalLM.from_pretrained(tmp_dir)
inp = torch.tensor([calibration_dataset[0]["input_ids"]])
out = model(inp)[0]
q_out = q_model(inp)[0]
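
For completeness, the simplest weight-only path exercised here is quantize(weight_only=True) without an explicit config, which, per the comment in the test above, defaults to the RTN method with the NF4 weight dtype. A minimal sketch (model and output directory are placeholders):

# Minimal sketch of the default weight-only path (RTN + NF4) from the test above.
from transformers import AutoModelForCausalLM
from optimum.intel import INCQuantizer
from optimum.intel.neural_compressor import ITREXAutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-GPTNeoForCausalLM")
quantizer = INCQuantizer.from_pretrained(model, task="text-generation")
quantizer.quantize(weight_only=True, save_directory="rtn_nf4_model")  # placeholder directory
loaded = ITREXAutoModelForCausalLM.from_pretrained("rtn_nf4_model")
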
1 change: 1 addition & 0 deletions tests/openvino/test_modeling_basic.py
@@ -7,6 +7,7 @@
This test is meant to run quickly with tiny test models. More extensive tests are in
test_modeling.py.
"""

# ruff: noqa

import gc
