diff --git a/.github/workflows/test_inc.yml b/.github/workflows/test_inc.yml
index f3398858a7..63ceb75158 100644
--- a/.github/workflows/test_inc.yml
+++ b/.github/workflows/test_inc.yml
@@ -32,7 +32,7 @@ jobs:
           python -m pip install --upgrade pip
           pip install cmake>=3.16
           pip install py-cpuinfo
-          pip install torch==2.1.0+cpu --extra-index-url https://download.pytorch.org/whl/cpu
+          pip install torch==2.1.0 torchaudio==2.1.0 torchvision==0.16 --extra-index-url https://download.pytorch.org/whl/cpu
           pip install .[neural-compressor,diffusers,tests]
           pip install intel-extension-for-pytorch==2.1.100
       - name: Test with Pytest
diff --git a/optimum/intel/neural_compressor/quantization.py b/optimum/intel/neural_compressor/quantization.py
index 7b294a55ec..d5ff782db3 100644
--- a/optimum/intel/neural_compressor/quantization.py
+++ b/optimum/intel/neural_compressor/quantization.py
@@ -287,17 +287,17 @@ def quantize(

         if not isinstance(quantization_config, PostTrainingQuantConfig):
             if use_cpu:
-                # will remove after intel-extension-for-transformers 1.3.3 released
+                # To be removed after the intel-extension-for-transformers 1.3.3 release.
                 quantization_config.device = "cpu"
                 quantization_config.post_init()
             elif use_xpu:
-                # will remove after intel-extension-for-transformers 1.3.3 released
+                # To be removed after the intel-extension-for-transformers 1.3.3 release.
                 quantization_config.device = "xpu"
                 quantization_config.post_init_xpu()
             self._quantized_model = convert_to_quantized_model(
                 self._original_model, quantization_config, device=quantization_config.device
             )
-            # will remove after intel-extension-for-transformers 1.3.3 released
+            # To be removed after the intel-extension-for-transformers 1.3.3 release.
             if hasattr(quantization_config, "calib_dataloader"):
                 quantization_config.calib_dataloader = None
             self._quantized_model.quantization_config = quantization_config
diff --git a/setup.py b/setup.py
index 2a125597df..6452da43b1 100644
--- a/setup.py
+++ b/setup.py
@@ -49,9 +49,6 @@
     "rjieba",
     "timm",
     "invisible-watermark>=0.2.0",
-    # Will remove after intel-extension-for-transformers 1.3.3 released.
-    "intel-extension-for-transformers>=1.3",
-    "peft",
    "auto-gptq",
     "transformers_stream_generator",
     "einops",
@@ -60,7 +57,14 @@
 QUALITY_REQUIRE = ["black~=23.1", "ruff>=0.0.241"]

 EXTRAS_REQUIRE = {
-    "neural-compressor": ["neural-compressor>=2.2.0", "onnxruntime<1.15.0", "accelerate"],
+    "neural-compressor": [
+        "neural-compressor>=2.2.0",
+        "onnxruntime<1.15.0",
+        "accelerate",
+        # To be removed after the intel-extension-for-transformers 1.3.3 release.
+ "intel-extension-for-transformers>=1.3", + "peft", + ], "openvino": ["openvino>=2023.3", "nncf>=2.8.1"], "openvino-tokenizers": ["openvino-tokenizers[transformers]"], "nncf": ["nncf>=2.8.1"], diff --git a/tests/neural_compressor/test_optimization.py b/tests/neural_compressor/test_optimization.py index 260cb97270..026138553c 100644 --- a/tests/neural_compressor/test_optimization.py +++ b/tests/neural_compressor/test_optimization.py @@ -88,6 +88,13 @@ class OptimizationTest(INCTestMixin): "hf-internal-testing/tiny-random-GPTNeoForCausalLM", ) + WEIGHT_ONLY_CONFIG = ( + (False, "RTN", "int4_clip"), + (False, "GPTQ", "int4_clip"), + (False, "RTN", "int8"), + (True, "", ""), + ) + @parameterized.expand(SUPPORTED_ARCHITECTURES_DYNAMIC) def test_dynamic_quantization(self, task, model_name, expected_quantized_matmuls): quantization_config = PostTrainingQuantConfig(approach="dynamic") @@ -202,59 +209,41 @@ def test_ipex_static_quantization_with_smoothquant(self, task, model_name, expec load_ipex_model=True, ) + @parameterized.expand(WEIGHT_ONLY_CONFIG) @unittest.skipIf( not is_intel_extension_for_transformers_available(), reason="Intel-extension-for-transformers not available!" ) - def test_weight_only_quantization(self): + def test_weight_only_quantization(self, no_config, algo, weight_dtype): model_name = "hf-internal-testing/tiny-random-GPTNeoForCausalLM" model = AutoModelForCausalLM.from_pretrained(model_name) tokenizer = AutoTokenizer.from_pretrained(model_name) tokenizer.add_special_tokens({"pad_token": "[PAD]"}) + quantizer = INCQuantizer.from_pretrained(copy.deepcopy(model), task="text-generation") + calibration_dataset = _generate_dataset(quantizer, tokenizer, num_samples=2) with tempfile.TemporaryDirectory() as tmp_dir: - quantizer = INCQuantizer.from_pretrained(copy.deepcopy(model), task="text-generation") - calibration_dataset = _generate_dataset(quantizer, tokenizer, num_samples=2) - quantization_config = WeightOnlyQuantConfig(weight_dtype="int8") - q_model = quantizer.quantize( - quantization_config=quantization_config, - save_directory=tmp_dir, - ) - q_model = ITREXAutoModelForCausalLM.from_pretrained(tmp_dir) - inp = torch.tensor([calibration_dataset[0]["input_ids"]]) - out = model(inp)[0] - q_out = q_model(inp)[0] - self.assertTrue(torch.all(torch.isclose(out, q_out, atol=5e-1))) - - with tempfile.TemporaryDirectory() as tmp_dir: - quantizer = INCQuantizer.from_pretrained(copy.deepcopy(model), task="text-generation") - calibration_dataset = _generate_dataset(quantizer, tokenizer, num_samples=2) - quantization_config = WeightOnlyQuantConfig( - algorithm="GPTQ", - algorithm_args={ - "percdamp": 0.01, - "act_order": False, - "scheme": "sym", - }, - weight_dtype="int4_clip", - ) - q_model = quantizer.quantize( - quantization_config=quantization_config, - calibration_dataset=calibration_dataset, - save_directory=tmp_dir, - ) - q_model = ITREXAutoModelForCausalLM.from_pretrained(tmp_dir) - inp = torch.tensor([calibration_dataset[0]["input_ids"]]) - out = model(inp)[0] - q_out = q_model(inp)[0] - self.assertTrue(torch.all(torch.isclose(out, q_out, atol=5e-1))) - - with tempfile.TemporaryDirectory() as tmp_dir: - quantizer = INCQuantizer.from_pretrained(copy.deepcopy(model), task="text-generation") - calibration_dataset = _generate_dataset(quantizer, tokenizer, num_samples=2) - q_model = quantizer.quantize( - weight_only=True, # use RTN quantization method and NF4 weight data type is default. 
-                save_directory=tmp_dir,
-            )
+            if not no_config:
+                if algo == "GPTQ":
+                    algorithm_args = {
+                        "percdamp": 0.01,
+                        "act_order": False,
+                        "scheme": "sym",
+                    }
+                quantization_config = WeightOnlyQuantConfig(
+                    algorithm=algo,
+                    algorithm_args=algorithm_args if algo == "GPTQ" else None,
+                    weight_dtype=weight_dtype,
+                )
+                q_model = quantizer.quantize(
+                    quantization_config=quantization_config,
+                    calibration_dataset=calibration_dataset if algo == "GPTQ" else None,
+                    save_directory=tmp_dir,
+                )
+            else:
+                q_model = quantizer.quantize(
+                    weight_only=True,  # By default, RTN quantization with the NF4 weight data type is used.
+                    save_directory=tmp_dir,
+                )
             q_model = ITREXAutoModelForCausalLM.from_pretrained(tmp_dir)
             inp = torch.tensor([calibration_dataset[0]["input_ids"]])
             out = model(inp)[0]
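
Below is a minimal sketch of the weight-only flow exercised by the parameterized test above (the RTN/int8 case). INCQuantizer is the optimum.intel API already used in the test; the import locations of WeightOnlyQuantConfig and the ITREX AutoModelForCausalLM are assumptions based on intel-extension-for-transformers 1.3.x and may need adjusting for other versions.

# Minimal sketch, assuming optimum-intel's neural-compressor extra and
# intel-extension-for-transformers>=1.3 are installed; the ITREX import paths
# below are assumptions for the 1.3.x series.
import tempfile

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from optimum.intel import INCQuantizer
from intel_extension_for_transformers.transformers import AutoModelForCausalLM as ITREXAutoModelForCausalLM
from intel_extension_for_transformers.transformers import WeightOnlyQuantConfig

model_name = "hf-internal-testing/tiny-random-GPTNeoForCausalLM"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

quantizer = INCQuantizer.from_pretrained(model, task="text-generation")
# RTN needs no calibration data; GPTQ would additionally pass calibration_dataset=... to quantize().
quantization_config = WeightOnlyQuantConfig(algorithm="RTN", weight_dtype="int8")

with tempfile.TemporaryDirectory() as tmp_dir:
    # Quantize the weights, save the result, then reload it with the ITREX model class.
    quantizer.quantize(quantization_config=quantization_config, save_directory=tmp_dir)
    q_model = ITREXAutoModelForCausalLM.from_pretrained(tmp_dir)
    inp = tokenizer("hello world", return_tensors="pt").input_ids
    with torch.no_grad():
        q_out = q_model(inp)[0]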