diff --git a/tests/neural_compressor/test_optimization.py b/tests/neural_compressor/test_optimization.py
index e31739b943..21cdeabfaa 100644
--- a/tests/neural_compressor/test_optimization.py
+++ b/tests/neural_compressor/test_optimization.py
@@ -67,13 +67,13 @@ class OptimizationTest(INCTestMixin):
     SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS = (
-        ("text-classification", "hf-internal-testing/tiny-random-bert", 34),
+        ("text-classification", "hf-internal-testing/tiny-random-BertForSequenceClassification", 21),
         # ("text-generation", "hf-internal-testing/tiny-random-BloomForCausalLM", 1), # TODO : enable causal lm task once INC ONNX export fixed
     )

     SUPPORTED_ARCHITECTURES_DYNAMIC = SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS + (
-        ("fill-mask", "hf-internal-testing/tiny-random-DistilBertForMaskedLM", 34),
-        ("token-classification", "hf-internal-testing/tiny-random-AlbertForTokenClassification", 34),
+        ("fill-mask", "hf-internal-testing/tiny-random-DistilBertForMaskedLM", 22),
+        ("token-classification", "hf-internal-testing/tiny-random-AlbertForTokenClassification", 26),
     )

     TEXT_GENERATION_SUPPORTED_ARCHITECTURES = (
@@ -84,35 +84,45 @@ class OptimizationTest(INCTestMixin):
     @parameterized.expand(SUPPORTED_ARCHITECTURES_DYNAMIC)
     def test_dynamic_quantization(self, task, model_name, expected_quantized_matmuls):
         quantization_config = PostTrainingQuantConfig(approach="dynamic")
-        model = ORT_SUPPORTED_TASKS[task]["class"][0].auto_model_class.from_pretrained(model_name)
+        model_class = ORT_SUPPORTED_TASKS[task]["class"][0]
         tokenizer = AutoTokenizer.from_pretrained(model_name)
-        quantizer = INCQuantizer.from_pretrained(model, task=task)
         save_onnx_model = False
+        quantized_model = None
         with tempfile.TemporaryDirectory() as tmp_dir:
-            quantizer.quantize(
-                quantization_config=quantization_config,
-                save_directory=tmp_dir,
-                save_onnx_model=save_onnx_model,
-            )
+            for backend in ["torch", "ort"]:
+                if backend == "torch":
+                    model = model_class.auto_model_class.from_pretrained(model_name)
+                else:
+                    model = model_class.from_pretrained(model_name, export=True)
+
+                quantizer = INCQuantizer.from_pretrained(model, task=task)
+                quantizer.quantize(
+                    quantization_config=quantization_config,
+                    save_directory=tmp_dir,
+                    save_onnx_model=save_onnx_model,
+                )
+                if backend == "torch":
+                    quantized_model = quantizer._quantized_model
+
             self.check_model_outputs(
-                q_model=quantizer._quantized_model,
+                q_model=quantized_model,
                 task=task,
                 tokenizer=tokenizer,
                 save_directory=tmp_dir,
                 expected_quantized_matmuls=expected_quantized_matmuls,
                 is_static=False,
-                load_onnx_model=save_onnx_model,
+                load_onnx_model=True,
+                load_inc_model=True,
             )

     @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS)
     def test_static_quantization(self, task, model_name, expected_quantized_matmuls):
         num_samples = 10
-        model = ORT_SUPPORTED_TASKS[task]["class"][0].auto_model_class.from_pretrained(model_name)
+        model_class = ORT_SUPPORTED_TASKS[task]["class"][0]
         tokenizer = AutoTokenizer.from_pretrained(model_name)
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
-        quantizer = INCQuantizer.from_pretrained(model, task=task)
-        calibration_dataset = _generate_dataset(quantizer, tokenizer, num_samples=num_samples)
+
         save_onnx_model = False
         op_type_dict = (
             {"Embedding": {"weight": {"dtype": ["fp32"]}, "activation": {"dtype": ["fp32"]}}}
@@ -120,22 +130,35 @@ def test_static_quantization(self, task, model_name, expected_quantized_matmuls):
             else None
         )
         quantization_config = PostTrainingQuantConfig(approach="static", op_type_dict=op_type_dict)
+        quantized_model = None
+
         with tempfile.TemporaryDirectory() as tmp_dir:
-            quantizer.quantize(
-                quantization_config=quantization_config,
-                calibration_dataset=calibration_dataset,
-                save_directory=tmp_dir,
-                save_onnx_model=save_onnx_model,
-            )
+            for backend in ["torch", "ort"]:
+                if backend == "torch":
+                    model = model_class.auto_model_class.from_pretrained(model_name)
+                else:
+                    model = model_class.from_pretrained(model_name, export=True)
+                quantizer = INCQuantizer.from_pretrained(model, task=task)
+                calibration_dataset = _generate_dataset(quantizer, tokenizer, num_samples=num_samples)
+                quantizer.quantize(
+                    quantization_config=quantization_config,
+                    calibration_dataset=calibration_dataset,
+                    save_directory=tmp_dir,
+                    save_onnx_model=save_onnx_model,
+                )
+                if backend == "torch":
+                    quantized_model = quantizer._quantized_model
+
             self.check_model_outputs(
-                q_model=quantizer._quantized_model,
+                q_model=quantized_model,
                 task=task,
                 tokenizer=tokenizer,
                 save_directory=tmp_dir,
                 expected_quantized_matmuls=expected_quantized_matmuls,
                 is_static=True,
+                load_onnx_model=True,
+                load_inc_model=True,
                 num_samples=num_samples,
-                load_onnx_model=save_onnx_model,
             )

     @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS)
diff --git a/tests/neural_compressor/utils_tests.py b/tests/neural_compressor/utils_tests.py
index 34e699c186..521ea71e79 100644
--- a/tests/neural_compressor/utils_tests.py
+++ b/tests/neural_compressor/utils_tests.py
@@ -55,9 +55,10 @@ def num_quantized_matmul_onnx_model(onnx_model):
     num_quantized_matmul = 0
     for node in onnx_model.graph.node:
-        if "quantizelinear" == node.op_type.lower():
+        if "QuantizeLinear" in node.name:
            num_quantized_matmul += 1
-    return num_quantized_matmul // 2
+
+    return num_quantized_matmul


 def _preprocess_function(examples, tokenizer, column_name):
@@ -90,22 +91,31 @@ def check_model_outputs(
         expected_quantized_matmuls,
         is_static=True,
         load_onnx_model=True,
+        load_inc_model=True,
         num_samples=None,
         file_name=ONNX_WEIGHTS_NAME,
     ):
         tokens = tokenizer("This is a sample input", return_tensors="pt")
-        inc_model = eval(_HEAD_TO_AUTOMODELS[task]).from_pretrained(save_directory)
+
         model_kwargs = (
             {"decoder_file_name": file_name, "use_cache": False}
             if task == "text-generation"
             else {"file_name": file_name}
         )
         inc_config = INCConfig.from_pretrained(save_directory)
-        self.assertEqual(inc_config.save_onnx_model, load_onnx_model)
         if num_samples is not None:
             self.assertEqual(inc_config.quantization["dataset_num_samples"], num_samples)
+        with torch.no_grad():
+            model_outputs = q_model(**tokens)
+            outputs = model_outputs["logits"] if isinstance(model_outputs, dict) else model_outputs[0]
+        if load_inc_model:
+            inc_model = eval(_HEAD_TO_AUTOMODELS[task]).from_pretrained(save_directory)
+            inc_model_outputs = inc_model(**tokens)
+            self.assertTrue(torch.equal(outputs, inc_model_outputs["logits"]))
+        # self.assertEqual(inc_config.save_onnx_model, load_onnx_model)
+
         if load_onnx_model:
             onnx_model = onnx_load(os.path.join(save_directory, file_name))
             num_quantized_matmul = num_quantized_matmul_onnx_model(onnx_model)
@@ -117,13 +127,7 @@ def check_model_outputs(
             ort_model = ORT_SUPPORTED_TASKS[task]["class"][0].from_pretrained(save_directory, **model_kwargs)
             ort_outputs = ort_model(**tokens)
             self.assertTrue("logits" in ort_outputs)
-
-        with torch.no_grad():
-            model_outputs = q_model(**tokens)
-            inc_model_outputs = inc_model(**tokens)
-            outputs = model_outputs["logits"] if isinstance(model_outputs, dict) else model_outputs[0]
-            self.assertTrue(torch.equal(outputs, inc_model_outputs["logits"]))
-            # self.assertTrue(torch.allclose(ort_outputs.logits, inc_model_outputs.logits, atol=1e-4))
+            # self.assertTrue(torch.allclose(ort_outputs.logits, outputs, atol=1e-3))

     @staticmethod
     def get_trainer(