From 806fa0bc03d2de0d424927f8ec6219733b8e750d Mon Sep 17 00:00:00 2001
From: changwangss
Date: Fri, 14 Jun 2024 02:42:10 -0700
Subject: [PATCH] improve deployment examples

Signed-off-by: changwangss
---
 .../quantization/run_tuning.sh                |   2 +-
 .../language-modeling/quantization/run_clm.py |  20 +--
 .../language-modeling/quantization/run_mlm.py |  21 +--
 .../language-modeling/quantization/run_plm.py |  20 +--
 .../pytorch/optimization_README.md            |   2 +-
 .../squad/bert_large/run_bert_large.sh        |   2 +-
 .../deployment/squad/bert_large/run_qa.py     |  56 +++++--
 .../squad/ipex/bert_large/run_bert_large.sh   |   2 +-
 .../squad/ipex/bert_large/run_qa.py           |  25 ++-
 .../distilbert_base_uncased/run_distilbert.sh |   2 +-
 .../ipex/distilbert_base_uncased/run_qa.py    |  26 ++-
 .../run_distilbert_sparse.sh                  |   2 +-
 .../distilbert_base_uncased_sparse/run_qa.py  |  25 ++-
 .../length_adaptive_transformer/README.md     |   2 +-
 .../length_adaptive_transformer/run_LAT.sh    |   2 +-
 .../length_adaptive_transformer/run_qa.py     |  23 ++-
 .../distilbert_base_uncased/run_emotion.py    |  56 +++++--
 .../distilbert_base_uncased/run_emotion.sh    |   2 +-
 .../mrpc/bert_base/run_bert_base.sh           |   2 +-
 .../deployment/mrpc/bert_base/run_glue.py     |  48 ++++--
 .../bert_base_cased/run_bert_base_cased.sh    |   2 +-
 .../mrpc/bert_base_cased/run_glue.py          |  48 ++++--
 .../mrpc/bert_mini/run_bert_mini.sh           |   2 +-
 .../deployment/mrpc/bert_mini/run_glue.py     |  49 ++++--
 .../run_distilbert_base.sh                    |   2 +-
 .../mrpc/distilbert_base_uncased/run_glue.py  |  49 ++++--
 .../deployment/mrpc/roberta_base/run_glue.py  |  49 ++++--
 .../mrpc/roberta_base/run_roberta_base.sh     |   2 +-
 .../sst2/bert_mini/run_bert_mini.sh           |   2 +-
 .../deployment/sst2/bert_mini/run_glue.py     |  49 ++++--
 .../run_distilbert_base.sh                    |   2 +-
 .../sst2/distilbert_base_uncased/run_glue.py  |  49 ++++--
 .../sst2/minilm_l6_h384_uncased/run_glue.py   |  48 ++++--
 .../sst2/minilm_l6_h384_uncased/run_minilm.sh |   2 +-
 .../deployment/mteb/bge/run_bge.sh            |   2 +-
 .../deployment/mteb/bge/run_mteb.py           |  50 ++++--
 .../quantization/run_tuning.sh                |   2 +-
 .../text2text-generation/run_tuning.sh        |   2 +-
 tests/CI/test_quantization.py                 | 155 +++++-------------
 workflows/dlsa/run_dlsa.py                    |  14 +-
 .../src/infer_itrex.py                        |  14 +-
 41 files changed, 502 insertions(+), 432 deletions(-)

diff --git a/examples/huggingface/pytorch/code-generation/quantization/run_tuning.sh b/examples/huggingface/pytorch/code-generation/quantization/run_tuning.sh
index 524a5e8d46d..0a793301dbb 100644
--- a/examples/huggingface/pytorch/code-generation/quantization/run_tuning.sh
+++ b/examples/huggingface/pytorch/code-generation/quantization/run_tuning.sh
@@ -16,7 +16,7 @@ function init_params {
   model_name_or_path="bigcode/starcoder"
   extra_cmd=""
   batch_size=8
-  approach="PostTrainingStatic"
+  approach="static"
   alpha=0.5
   script="run_generation.py"
   for var in "$@"
diff --git a/examples/huggingface/pytorch/language-modeling/quantization/run_clm.py b/examples/huggingface/pytorch/language-modeling/quantization/run_clm.py
index 883d2468c68..9a9637785ec 100644
--- a/examples/huggingface/pytorch/language-modeling/quantization/run_clm.py
+++ b/examples/huggingface/pytorch/language-modeling/quantization/run_clm.py
@@ -583,29 +583,23 @@ def compute_metrics(eval_preds):
             greater_is_better=False
         )
         trainer.metrics = tune_metric
+        tuning_criterion = TuningCriterion(max_trials=600)
+        accuracy_criterion = AccuracyCriterion(
+            higher_is_better=False, # optional.
+            criterion="relative" if optim_args.is_relative else "absolute", # optional. Available values are "relative" and "absolute".
+            tolerable_loss=optim_args.perf_tol, # optional.
+        )
         if optim_args.quantization_approach != "qat":
-            tuning_criterion = TuningCriterion(max_trials=600)
-            accuracy_criterion = AccuracyCriterion(
-                higher_is_better=False, # optional.
-                criterion="relative" if optim_args.is_relative else "absolute", # optional. Available values are "relative" and "absolute".
-                tolerable_loss=optim_args.perf_tol, # optional.
-            )
             quantization_config = PostTrainingQuantConfig(
                 approach=optim_args.quantization_approach,
                 tuning_criterion=tuning_criterion,
                 accuracy_criterion=accuracy_criterion
             )
         else:
-            tuning_criterion = TuningCriterion(max_trials=600)
-            accuracy_criterion = AccuracyCriterion(
-                higher_is_better=False, # optional.
-                criterion="relative" if optim_args.is_relative else "absolute", # optional. Available values are "relative" and "absolute".
-                tolerable_loss=optim_args.perf_tol, # optional.
-            )
             quantization_config = QuantizationAwareTrainingConfig(
                 tuning_criterion=tuning_criterion,
                 accuracy_criterion=accuracy_criterion
-                )
+            )
             early_stopping_patience = 2
             early_stopping_threshold = 0.001 # optional
             trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience, \
diff --git a/examples/huggingface/pytorch/language-modeling/quantization/run_mlm.py b/examples/huggingface/pytorch/language-modeling/quantization/run_mlm.py
index 15bdc445990..ea78bdd0e08 100644
--- a/examples/huggingface/pytorch/language-modeling/quantization/run_mlm.py
+++ b/examples/huggingface/pytorch/language-modeling/quantization/run_mlm.py
@@ -602,29 +602,24 @@ def compute_metrics(eval_preds):
             criterion=optim_args.perf_tol,
             greater_is_better=False
         )
+        trainer.metrics = tune_metric
+        tuning_criterion = TuningCriterion(max_trials=600)
+        accuracy_criterion = AccuracyCriterion(
+            higher_is_better=False, # optional.
+            criterion="relative" if optim_args.is_relative else "absolute", # optional. Available values are "relative" and "absolute".
+            tolerable_loss=optim_args.perf_tol, # optional.
+        )
         if optim_args.quantization_approach != "qat":
-            tuning_criterion = TuningCriterion(max_trials=600)
-            accuracy_criterion = AccuracyCriterion(
-                higher_is_better=False, # optional.
-                criterion="relative" if optim_args.is_relative else "absolute", # optional. Available values are "relative" and "absolute".
-                tolerable_loss=optim_args.perf_tol, # optional.
-            )
             quantization_config = PostTrainingQuantConfig(
                 approach=optim_args.quantization_approach,
                 tuning_criterion=tuning_criterion,
                 accuracy_criterion=accuracy_criterion
             )
         else:
-            tuning_criterion = TuningCriterion(max_trials=600)
-            accuracy_criterion = AccuracyCriterion(
-                higher_is_better=False, # optional.
-                criterion="relative" if optim_args.is_relative else "absolute", # optional. Available values are "relative" and "absolute".
-                tolerable_loss=optim_args.perf_tol, # optional.
-            )
             quantization_config = QuantizationAwareTrainingConfig(
                 tuning_criterion=tuning_criterion,
                 accuracy_criterion=accuracy_criterion
-                )
+            )
             early_stopping_patience = 2
             early_stopping_threshold = 0.001 # optional
             trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience, \
diff --git a/examples/huggingface/pytorch/language-modeling/quantization/run_plm.py b/examples/huggingface/pytorch/language-modeling/quantization/run_plm.py
index be104b5a701..4550de43e69 100644
--- a/examples/huggingface/pytorch/language-modeling/quantization/run_plm.py
+++ b/examples/huggingface/pytorch/language-modeling/quantization/run_plm.py
@@ -549,29 +549,23 @@ def group_texts(examples):
             greater_is_better=False
         )
         trainer.metrics = tune_metric
+        tuning_criterion = TuningCriterion(max_trials=600)
+        accuracy_criterion = AccuracyCriterion(
+            higher_is_better=False, # optional.
+            criterion="relative" if optim_args.is_relative else "absolute", # optional. Available values are "relative" and "absolute".
+            tolerable_loss=optim_args.perf_tol, # optional.
+        )
         if optim_args.quantization_approach != "qat":
-            tuning_criterion = TuningCriterion(max_trials=600)
-            accuracy_criterion = AccuracyCriterion(
-                higher_is_better=False, # optional.
-                criterion="relative" if optim_args.is_relative else "absolute", # optional. Available values are "relative" and "absolute".
-                tolerable_loss=optim_args.perf_tol, # optional.
-            )
             quantization_config = PostTrainingQuantConfig(
                 approach=optim_args.quantization_approach,
                 tuning_criterion=tuning_criterion,
                 accuracy_criterion=accuracy_criterion
             )
         else:
-            tuning_criterion = TuningCriterion(max_trials=600)
-            accuracy_criterion = AccuracyCriterion(
-                higher_is_better=False, # optional.
-                criterion="relative" if optim_args.is_relative else "absolute", # optional. Available values are "relative" and "absolute".
-                tolerable_loss=optim_args.perf_tol, # optional.
-            )
             quantization_config = QuantizationAwareTrainingConfig(
                 tuning_criterion=tuning_criterion,
                 accuracy_criterion=accuracy_criterion
-                )
+            )
             early_stopping_patience = 2
             early_stopping_threshold = 0.001 # optional
             trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience, \
diff --git a/examples/huggingface/pytorch/optimization_README.md b/examples/huggingface/pytorch/optimization_README.md
index 2bafa814908..3d1909b01a6 100644
--- a/examples/huggingface/pytorch/optimization_README.md
+++ b/examples/huggingface/pytorch/optimization_README.md
@@ -4,7 +4,7 @@ Welcome to Pytorch Huggingface examples. The examples is following from [Hugging
 ## Quantization approach
 
-| Task | PostTrainingDynamic | PostTrainingStatic | QuantizationAwareTraining
+| Task | dynamic | static | qat
 |---|:---:|:---:|:---:|
 |**`language-modeling`**| ✅ | ✅ | ✅
 |**`multi-choice`**| ✅ | ✅ | ✅
diff --git a/examples/huggingface/pytorch/question-answering/deployment/squad/bert_large/run_bert_large.sh b/examples/huggingface/pytorch/question-answering/deployment/squad/bert_large/run_bert_large.sh
index 5c65ca50e0b..017b7a2efd3 100644
--- a/examples/huggingface/pytorch/question-answering/deployment/squad/bert_large/run_bert_large.sh
+++ b/examples/huggingface/pytorch/question-answering/deployment/squad/bert_large/run_bert_large.sh
@@ -123,7 +123,7 @@ else
     echo "========== Prepare Model ${MODEL_NAME_OR_PATH} with Precision ${PRECISION} ========"
     mode_cmd=""
     if [[ ${PRECISION} = 'int8' ]]; then
-        mode_cmd=$mode_cmd" --tune --quantization_approach PostTrainingStatic"
+        mode_cmd=$mode_cmd" --tune --quantization_approach static"
     elif [[ ${PRECISION} = 'bf16' ]]; then
         mode_cmd=$mode_cmd" --enable_bf16"
     fi
diff --git a/examples/huggingface/pytorch/question-answering/deployment/squad/bert_large/run_qa.py b/examples/huggingface/pytorch/question-answering/deployment/squad/bert_large/run_qa.py
index 4ce35df134d..39bb8c074ac 100644
--- a/examples/huggingface/pytorch/question-answering/deployment/squad/bert_large/run_qa.py
+++ b/examples/huggingface/pytorch/question-answering/deployment/squad/bert_large/run_qa.py
@@ -26,7 +26,13 @@ import transformers
 from dataclasses import dataclass, field
 from datasets import load_dataset, load_metric
-from intel_extension_for_transformers.transformers import metrics , OptimizedModel, QuantizationConfig
+from intel_extension_for_transformers.transformers import metrics , OptimizedModel
+from neural_compressor.config import (
+    PostTrainingQuantConfig,
+    QuantizationAwareTrainingConfig,
+    TuningCriterion,
+    AccuracyCriterion
+)
 from trainer_qa import QuestionAnsweringTrainer
 from transformers import (
     AutoConfig,
@@ -211,9 +217,9 @@ class OptimizationArguments:
         metadata={"help": "Whether or not to apply quantization."},
     )
     quantization_approach: Optional[str] = field(
-        default="PostTrainingStatic",
-        metadata={"help": "Quantization approach. Supported approach are PostTrainingStatic, "
-                  "PostTrainingDynamic and QuantizationAwareTraining."},
+        default="static",
+        metadata={"help": "Quantization approach. Supported approach are static, "
+                  "dynamic and qat."},
     )
     metric_name: Optional[str] = field(
         default="eval_f1",
@@ -646,25 +652,43 @@ def compute_metrics(p: EvalPrediction):
         if not training_args.do_eval:
             raise ValueError("do_eval must be set to True for quantization.")
 
-        if optim_args.quantization_approach != "PostTrainingDynamic":
+        if optim_args.quantization_approach != "dynamic":
             if not training_args.do_train:
                 raise ValueError(
                     "do_train must be set to True for static and aware training quantization."
                )
-        if optim_args.quantization_approach == "QuantizationAwareTraining":
-            early_stopping_patience = 6
-            early_stopping_threshold = 0.001 # optional
-            trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience,
-                                                                    early_stopping_threshold))
         tune_metric = metrics.Metric(
             name=metric_name, is_relative=optim_args.is_relative, criterion=optim_args.perf_tol
         )
-        quantization_config = QuantizationConfig(
-            approach=optim_args.quantization_approach,
-            max_trials=200,
-            metrics=[tune_metric],
-        )
+        trainer.metrics = tune_metric
+        if optim_args.quantization_approach != "qat":
+            tuning_criterion = TuningCriterion(max_trials=600)
+            accuracy_criterion = AccuracyCriterion(
+                higher_is_better=False, # optional.
+                criterion="relative" if optim_args.is_relative else "absolute", # optional. Available values are "relative" and "absolute".
+                tolerable_loss=optim_args.perf_tol, # optional.
+            )
+            quantization_config = PostTrainingQuantConfig(
+                approach=optim_args.quantization_approach,
+                tuning_criterion=tuning_criterion,
+                accuracy_criterion=accuracy_criterion
+            )
+        else:
+            tuning_criterion = TuningCriterion(max_trials=600)
+            accuracy_criterion = AccuracyCriterion(
+                higher_is_better=False, # optional.
+                criterion="relative" if optim_args.is_relative else "absolute", # optional. Available values are "relative" and "absolute".
+                tolerable_loss=optim_args.perf_tol, # optional.
+            )
+            quantization_config = QuantizationAwareTrainingConfig(
+                tuning_criterion=tuning_criterion,
+                accuracy_criterion=accuracy_criterion
+            )
+            early_stopping_patience = 2
+            early_stopping_threshold = 0.001 # optional
+            trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience, \
+                early_stopping_threshold))
         model = trainer.quantize(quant_config=quantization_config)
 
     if optim_args.benchmark or optim_args.accuracy_only:
@@ -674,7 +698,7 @@ def compute_metrics(p: EvalPrediction):
         max_eval_samples = data_args.max_eval_samples \
             if data_args.max_eval_samples is not None else len(eval_dataset)
         eval_samples = min(max_eval_samples, len(eval_dataset))
-        samples = eval_samples - (eval_samples % batch_size) \
+        samples = eval_samples - (eval_samples % training_args.per_device_eval_batch_size) \
             if training_args.dataloader_drop_last else eval_samples
         logger.info("metrics keys: {}".format(results.keys()))
         bert_task_acc_keys = ['eval_f1', 'eval_accuracy', 'eval_matthews_correlation',
diff --git a/examples/huggingface/pytorch/question-answering/deployment/squad/ipex/bert_large/run_bert_large.sh b/examples/huggingface/pytorch/question-answering/deployment/squad/ipex/bert_large/run_bert_large.sh
index 7f3253746ab..0d93e12079b 100644
--- a/examples/huggingface/pytorch/question-answering/deployment/squad/ipex/bert_large/run_bert_large.sh
+++ b/examples/huggingface/pytorch/question-answering/deployment/squad/ipex/bert_large/run_bert_large.sh
@@ -121,7 +121,7 @@ else
    echo "========== Prepare Model ${MODEL_NAME_OR_PATH} with Precision ${PRECISION} ========"
    mode_cmd=""
    if [[ ${PRECISION} = 'int8' ]]; then
-        mode_cmd=$mode_cmd" --tune --quantization_approach PostTrainingStatic"
+        mode_cmd=$mode_cmd" --tune --quantization_approach static"
    elif [[ ${PRECISION} = 'fp32' ]]; then
        mode_cmd=$mode_cmd" --fp32"
    fi
diff --git a/examples/huggingface/pytorch/question-answering/deployment/squad/ipex/bert_large/run_qa.py b/examples/huggingface/pytorch/question-answering/deployment/squad/ipex/bert_large/run_qa.py
index d34418e0dd8..46122402c6e 100644
--- a/examples/huggingface/pytorch/question-answering/deployment/squad/ipex/bert_large/run_qa.py
+++ b/examples/huggingface/pytorch/question-answering/deployment/squad/ipex/bert_large/run_qa.py
@@ -26,7 +26,8 @@ import transformers
 from dataclasses import dataclass, field
 from datasets import load_dataset, load_metric
-from intel_extension_for_transformers.transformers import metrics , OptimizedModel, QuantizationConfig
+from intel_extension_for_transformers.transformers import metrics , OptimizedModel
+from neural_compressor.config import PostTrainingQuantConfig
 from trainer_qa import QuestionAnsweringTrainer
 from transformers import (
     AutoConfig,
@@ -211,9 +212,9 @@ class OptimizationArguments:
         metadata={"help": "Whether or not to apply quantization."},
     )
     quantization_approach: Optional[str] = field(
-        default="PostTrainingStatic",
-        metadata={"help": "Quantization approach. Supported approach are PostTrainingStatic, "
-                  "PostTrainingDynamic and QuantizationAwareTraining."},
+        default="static",
+        metadata={"help": "Quantization approach. Supported approach are static, "
+                  "dynamic."},
     )
     metric_name: Optional[str] = field(
         default="eval_f1",
@@ -640,27 +641,21 @@ def compute_metrics(p: EvalPrediction):
         trainer.save_model(training_args.output_dir)
 
         trainer.calib_dataloader = trainer.get_eval_dataloader()
-        if optim_args.quantization_approach != "PostTrainingDynamic":
+        if optim_args.quantization_approach != "dynamic":
             if not training_args.do_train:
                 raise ValueError(
                     "do_train must be set to True for static and aware training quantization."
                 )
-        if optim_args.quantization_approach == "QuantizationAwareTraining":
-            early_stopping_patience = 6
-            early_stopping_threshold = 0.001 # optional
-            trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience,
-                                                                    early_stopping_threshold))
         tune_metric = metrics.Metric(
             name=metric_name, is_relative=optim_args.is_relative, criterion=optim_args.perf_tol
         )
-        quantization_config = QuantizationConfig(
+        trainer.metrics = tune_metric
+        quantization_config = PostTrainingQuantConfig(
+            backend="ipex",
             approach=optim_args.quantization_approach,
-            max_trials=200,
-            metrics=[tune_metric],
-            use_bf16=False
+            excluded_precisions=["bf16"]
         )
-        quantization_config.framework = "pytorch_ipex"
         model = trainer.quantize(quant_config=quantization_config)
 
     if optim_args.benchmark or optim_args.accuracy_only:
diff --git a/examples/huggingface/pytorch/question-answering/deployment/squad/ipex/distilbert_base_uncased/run_distilbert.sh b/examples/huggingface/pytorch/question-answering/deployment/squad/ipex/distilbert_base_uncased/run_distilbert.sh
index 1c0ca172eba..d0d2bbb9db7 100644
--- a/examples/huggingface/pytorch/question-answering/deployment/squad/ipex/distilbert_base_uncased/run_distilbert.sh
+++ b/examples/huggingface/pytorch/question-answering/deployment/squad/ipex/distilbert_base_uncased/run_distilbert.sh
@@ -121,7 +121,7 @@ else
    echo "========== Prepare Model ${MODEL_NAME_OR_PATH} with Precision ${PRECISION} ========"
    mode_cmd=""
    if [[ ${PRECISION} = 'int8' ]]; then
-        mode_cmd=$mode_cmd" --tune --quantization_approach PostTrainingStatic"
+        mode_cmd=$mode_cmd" --tune --quantization_approach static"
    elif [[ ${PRECISION} = 'fp32' ]]; then
        mode_cmd=$mode_cmd" --fp32"
    fi
diff --git a/examples/huggingface/pytorch/question-answering/deployment/squad/ipex/distilbert_base_uncased/run_qa.py b/examples/huggingface/pytorch/question-answering/deployment/squad/ipex/distilbert_base_uncased/run_qa.py
index 9417cf4f65f..46122402c6e 100644
--- a/examples/huggingface/pytorch/question-answering/deployment/squad/ipex/distilbert_base_uncased/run_qa.py
+++ b/examples/huggingface/pytorch/question-answering/deployment/squad/ipex/distilbert_base_uncased/run_qa.py
@@ -26,7 +26,8 @@ import transformers
 from dataclasses import dataclass, field
 from datasets import load_dataset, load_metric
-from intel_extension_for_transformers.transformers import metrics , OptimizedModel, QuantizationConfig
+from intel_extension_for_transformers.transformers import metrics , OptimizedModel
+from neural_compressor.config import PostTrainingQuantConfig
 from trainer_qa import QuestionAnsweringTrainer
 from transformers import (
     AutoConfig,
@@ -211,9 +212,9 @@ class OptimizationArguments:
         metadata={"help": "Whether or not to apply quantization."},
     )
     quantization_approach: Optional[str] = field(
-        default="PostTrainingStatic",
-        metadata={"help": "Quantization approach. Supported approach are PostTrainingStatic, "
-                  "PostTrainingDynamic and QuantizationAwareTraining."},
+        default="static",
+        metadata={"help": "Quantization approach. Supported approach are static, "
+                  "dynamic."},
     )
     metric_name: Optional[str] = field(
         default="eval_f1",
@@ -640,28 +641,21 @@ def compute_metrics(p: EvalPrediction):
         trainer.save_model(training_args.output_dir)
 
         trainer.calib_dataloader = trainer.get_eval_dataloader()
-        if optim_args.quantization_approach != "PostTrainingDynamic":
+        if optim_args.quantization_approach != "dynamic":
             if not training_args.do_train:
                 raise ValueError(
                     "do_train must be set to True for static and aware training quantization."
                 )
-
-        elif optim_args.quantization_approach == "QuantizationAwareTraining":
-            early_stopping_patience = 6
-            early_stopping_threshold = 0.001 # optional
-            trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience,
-                                                                    early_stopping_threshold))
         tune_metric = metrics.Metric(
             name=metric_name, is_relative=optim_args.is_relative, criterion=optim_args.perf_tol
         )
-        quantization_config = QuantizationConfig(
+        trainer.metrics = tune_metric
+        quantization_config = PostTrainingQuantConfig(
+            backend="ipex",
             approach=optim_args.quantization_approach,
-            max_trials=200,
-            metrics=[tune_metric],
-            use_bf16=False
+            excluded_precisions=["bf16"]
         )
-        quantization_config.framework = "pytorch_ipex"
         model = trainer.quantize(quant_config=quantization_config)
 
     if optim_args.benchmark or optim_args.accuracy_only:
diff --git a/examples/huggingface/pytorch/question-answering/deployment/squad/ipex/distilbert_base_uncased_sparse/run_distilbert_sparse.sh b/examples/huggingface/pytorch/question-answering/deployment/squad/ipex/distilbert_base_uncased_sparse/run_distilbert_sparse.sh
index 3e82836a0d2..7621827eef9 100644
--- a/examples/huggingface/pytorch/question-answering/deployment/squad/ipex/distilbert_base_uncased_sparse/run_distilbert_sparse.sh
+++ b/examples/huggingface/pytorch/question-answering/deployment/squad/ipex/distilbert_base_uncased_sparse/run_distilbert_sparse.sh
@@ -121,7 +121,7 @@ else
    echo "========== Prepare Model ${MODEL_NAME_OR_PATH} with Precision ${PRECISION} ========"
    mode_cmd=""
    if [[ ${PRECISION} = 'int8' ]]; then
-        mode_cmd=$mode_cmd" --tune --quantization_approach PostTrainingStatic"
+        mode_cmd=$mode_cmd" --tune --quantization_approach static"
    elif [[ ${PRECISION} = 'fp32' ]]; then
        mode_cmd=$mode_cmd" --fp32"
    fi
diff --git a/examples/huggingface/pytorch/question-answering/deployment/squad/ipex/distilbert_base_uncased_sparse/run_qa.py b/examples/huggingface/pytorch/question-answering/deployment/squad/ipex/distilbert_base_uncased_sparse/run_qa.py
index 25c51b01192..1c250201c9d 100644
--- a/examples/huggingface/pytorch/question-answering/deployment/squad/ipex/distilbert_base_uncased_sparse/run_qa.py
+++ b/examples/huggingface/pytorch/question-answering/deployment/squad/ipex/distilbert_base_uncased_sparse/run_qa.py
@@ -26,7 +26,8 @@ import transformers
 from dataclasses import dataclass, field
 from datasets import load_dataset, load_metric
-from intel_extension_for_transformers.transformers import metrics , OptimizedModel, QuantizationConfig
+from intel_extension_for_transformers.transformers import metrics , OptimizedModel
+from neural_compressor.config import PostTrainingQuantConfig
 from trainer_qa import QuestionAnsweringTrainer
 from transformers import (
     AutoConfig,
@@ -211,9 +212,9 @@ class OptimizationArguments:
         metadata={"help": "Whether or not to apply quantization."},
     )
     quantization_approach: Optional[str] = field(
-        default="PostTrainingStatic",
-        metadata={"help": "Quantization approach. Supported approach are PostTrainingStatic, "
-                  "PostTrainingDynamic and QuantizationAwareTraining."},
+        default="static",
+        metadata={"help": "Quantization approach. Supported approach are static, "
+                  "dynamic."},
     )
     metric_name: Optional[str] = field(
         default="eval_f1",
@@ -640,27 +641,21 @@ def compute_metrics(p: EvalPrediction):
         trainer.save_model(training_args.output_dir)
 
         trainer.calib_dataloader = trainer.get_eval_dataloader()
-        if optim_args.quantization_approach != "PostTrainingDynamic":
+        if optim_args.quantization_approach != "dynamic":
             if not training_args.do_train:
                 raise ValueError(
                     "do_train must be set to True for static and aware training quantization."
                 )
-        elif optim_args.quantization_approach == "QuantizationAwareTraining":
-            early_stopping_patience = 6
-            early_stopping_threshold = 0.001 # optional
-            trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience,
-                                                                    early_stopping_threshold))
         tune_metric = metrics.Metric(
             name=metric_name, is_relative=optim_args.is_relative, criterion=optim_args.perf_tol
         )
-        quantization_config = QuantizationConfig(
+        trainer.metrics = tune_metric
+        quantization_config = PostTrainingQuantConfig(
+            backend="ipex",
             approach=optim_args.quantization_approach,
-            max_trials=200,
-            metrics=[tune_metric],
-            use_bf16=False
+            excluded_precisions=["bf16"],
         )
-        quantization_config.framework = "pytorch_ipex"
         model = trainer.quantize(quant_config=quantization_config)
 
     if optim_args.benchmark or optim_args.accuracy_only:
diff --git a/examples/huggingface/pytorch/question-answering/deployment/squad/length_adaptive_transformer/README.md b/examples/huggingface/pytorch/question-answering/deployment/squad/length_adaptive_transformer/README.md
index d7082e1194a..98f9050cdc5 100644
--- a/examples/huggingface/pytorch/question-answering/deployment/squad/length_adaptive_transformer/README.md
+++ b/examples/huggingface/pytorch/question-answering/deployment/squad/length_adaptive_transformer/README.md
@@ -76,7 +76,7 @@ python run_qa.py --model_name_or_path "sguskin/dynamic-minilmv2-L6-H384-squad1.1
 For INT8:
 ```shell
-python run_qa.py --model_name_or_path "sguskin/dynamic-minilmv2-L6-H384-squad1.1" --dataset_name squad --do_train --do_eval --output_dir model_and_tokenizer --overwrite_output_dir --length_config "(269, 253, 252, 202, 104, 34)" --overwrite_cache --to_onnx --tune --quantization_approach PostTrainingStatic
+python run_qa.py --model_name_or_path "sguskin/dynamic-minilmv2-L6-H384-squad1.1" --dataset_name squad --do_train --do_eval --output_dir model_and_tokenizer --overwrite_output_dir --length_config "(269, 253, 252, 202, 104, 34)" --overwrite_cache --to_onnx --tune --quantization_approach static
 ```
 
 For BF16:
diff --git a/examples/huggingface/pytorch/question-answering/deployment/squad/length_adaptive_transformer/run_LAT.sh b/examples/huggingface/pytorch/question-answering/deployment/squad/length_adaptive_transformer/run_LAT.sh
index 163c8894cf7..a2119fd1dc3 100644
--- a/examples/huggingface/pytorch/question-answering/deployment/squad/length_adaptive_transformer/run_LAT.sh
+++ b/examples/huggingface/pytorch/question-answering/deployment/squad/length_adaptive_transformer/run_LAT.sh
@@ -123,7 +123,7 @@ else
     echo "========== Prepare Model ${MODEL_NAME_OR_PATH} with Precision ${PRECISION} ========"
     mode_cmd=""
     if [[ ${PRECISION} = 'int8' ]]; then
-        mode_cmd=$mode_cmd" --tune --quantization_approach PostTrainingStatic"
+        mode_cmd=$mode_cmd" --tune --quantization_approach static"
     elif [[ ${PRECISION} = 'bf16' ]]; then
         mode_cmd=$mode_cmd" --enable_bf16"
     fi
diff --git a/examples/huggingface/pytorch/question-answering/deployment/squad/length_adaptive_transformer/run_qa.py b/examples/huggingface/pytorch/question-answering/deployment/squad/length_adaptive_transformer/run_qa.py
index eb41bf85a32..81209b7d4f7 100644
--- a/examples/huggingface/pytorch/question-answering/deployment/squad/length_adaptive_transformer/run_qa.py
+++ b/examples/huggingface/pytorch/question-answering/deployment/squad/length_adaptive_transformer/run_qa.py
@@ -29,7 +29,8 @@ import transformers
 from dataclasses import dataclass, field
 from datasets import load_dataset, load_metric
-from intel_extension_for_transformers.transformers import metrics , OptimizedModel, QuantizationConfig, DynamicLengthConfig
+from intel_extension_for_transformers.transformers import metrics , OptimizedModel, DynamicLengthConfig
+from neural_compressor.config import PostTrainingQuantConfig
 from trainer_qa import QuestionAnsweringTrainer
 from intel_extension_for_transformers.transformers.modeling.modeling_roberta_dynamic import RobertaForQuestionAnswering
 
@@ -221,9 +222,9 @@ class OptimizationArguments:
         metadata={"help": "Whether or not to apply quantization."},
     )
     quantization_approach: Optional[str] = field(
-        default="PostTrainingStatic",
-        metadata={"help": "Quantization approach. Supported approach are PostTrainingStatic, "
-                  "PostTrainingDynamic and QuantizationAwareTraining."},
+        default="static",
+        metadata={"help": "Quantization approach. Supported approach are static, "
+                  "dynamic."},
     )
     metric_name: Optional[str] = field(
         default="eval_f1",
@@ -780,24 +781,20 @@ def compute_metrics(p: EvalPrediction):
         trainer.save_model(training_args.output_dir)
 
         trainer.calib_dataloader = trainer.get_eval_dataloader()
-        if optim_args.quantization_approach != "PostTrainingDynamic":
+        if optim_args.quantization_approach != "dynamic":
             if not training_args.do_train:
                 raise ValueError(
                     "do_train must be set to True for static and aware training quantization."
                )
-        if optim_args.quantization_approach == "QuantizationAwareTraining":
-            early_stopping_patience = 6
-            early_stopping_threshold = 0.001 # optional
-            trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience,
-                                                                    early_stopping_threshold))
         tune_metric = metrics.Metric(
             name=metric_name, is_relative=optim_args.is_relative, criterion=optim_args.perf_tol
         )
-        quantization_config = QuantizationConfig(
+        trainer.metrics = tune_metric
+        quantization_config = PostTrainingQuantConfig(
+            backend="ipex",
             approach=optim_args.quantization_approach,
-            max_trials=200,
-            metrics=[tune_metric],
+            excluded_precisions=["bf16"]
         )
         model = trainer.quantize(quant_config=quantization_config)
diff --git a/examples/huggingface/pytorch/text-classification/deployment/emotion/distilbert_base_uncased/run_emotion.py b/examples/huggingface/pytorch/text-classification/deployment/emotion/distilbert_base_uncased/run_emotion.py
index 720e4109657..d599c564647 100644
--- a/examples/huggingface/pytorch/text-classification/deployment/emotion/distilbert_base_uncased/run_emotion.py
+++ b/examples/huggingface/pytorch/text-classification/deployment/emotion/distilbert_base_uncased/run_emotion.py
@@ -26,7 +26,13 @@ import transformers
 from dataclasses import dataclass, field
 from datasets import load_dataset, load_metric
-from intel_extension_for_transformers.transformers import metrics, objectives, OptimizedModel, QuantizationConfig
+from intel_extension_for_transformers.transformers import metrics, objectives, OptimizedModel
+from neural_compressor.config import (
+    PostTrainingQuantConfig,
+    QuantizationAwareTrainingConfig,
+    TuningCriterion,
+    AccuracyCriterion
+)
 from intel_extension_for_transformers.transformers.trainer import NLPTrainer
 from transformers import (
     AutoConfig,
@@ -198,9 +204,9 @@ class OptimizationArguments:
         metadata={"help": "Whether or not to apply quantization."},
     )
     quantization_approach: Optional[str] = field(
-        default="PostTrainingStatic",
-        metadata={"help": "Quantization approach. Supported approach are PostTrainingStatic, "
-                  "PostTrainingDynamic and QuantizationAwareTraining."},
+        default="static",
+        metadata={"help": "Quantization approach. Supported approach are static, "
+                  "dynamic and qat."},
     )
     metric_name: Optional[str] = field(
         default=None,
@@ -521,27 +527,43 @@ def compute_metrics(p: EvalPrediction):
         if not training_args.do_eval:
             raise ValueError("do_eval must be set to True for quantization.")
 
-        if optim_args.quantization_approach != "PostTrainingDynamic":
+        if optim_args.quantization_approach != "dynamic":
             if not training_args.do_train:
                 raise ValueError(
                     "do_train must be set to True for static and aware training quantization."
                 )
-        if optim_args.quantization_approach == "QuantizationAwareTraining":
-            early_stopping_patience = 6
-            early_stopping_threshold = 0.001 # optional
-            trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience,
-                                                                    early_stopping_threshold))
         tune_metric = metrics.Metric(
             name=metric_name, is_relative=optim_args.is_relative, criterion=optim_args.perf_tol
         )
-        objective = objectives.performance
-        quantization_config = QuantizationConfig(
-            approach=optim_args.quantization_approach,
-            max_trials=600,
-            metrics=[tune_metric],
-            objectives=[objective]
-        )
+        trainer.metrics = tune_metric
+        if optim_args.quantization_approach != "qat":
+            tuning_criterion = TuningCriterion(max_trials=600)
+            accuracy_criterion = AccuracyCriterion(
+                higher_is_better=False, # optional.
+                criterion="relative" if optim_args.is_relative else "absolute", # optional. Available values are "relative" and "absolute".
+                tolerable_loss=optim_args.perf_tol, # optional.
+            )
+            quantization_config = PostTrainingQuantConfig(
+                approach=optim_args.quantization_approach,
+                tuning_criterion=tuning_criterion,
+                accuracy_criterion=accuracy_criterion
+            )
+        else:
+            tuning_criterion = TuningCriterion(max_trials=600)
+            accuracy_criterion = AccuracyCriterion(
+                higher_is_better=False, # optional.
+                criterion="relative" if optim_args.is_relative else "absolute", # optional. Available values are "relative" and "absolute".
+                tolerable_loss=optim_args.perf_tol, # optional.
+            )
+            quantization_config = QuantizationAwareTrainingConfig(
+                tuning_criterion=tuning_criterion,
+                accuracy_criterion=accuracy_criterion
+            )
+            early_stopping_patience = 2
+            early_stopping_threshold = 0.001 # optional
+            trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience, \
+                early_stopping_threshold))
         model = trainer.quantize(quant_config=quantization_config)
 
     if optim_args.benchmark or optim_args.accuracy_only:
diff --git a/examples/huggingface/pytorch/text-classification/deployment/emotion/distilbert_base_uncased/run_emotion.sh b/examples/huggingface/pytorch/text-classification/deployment/emotion/distilbert_base_uncased/run_emotion.sh
index 57cee9c4494..79c91ee5b18 100644
--- a/examples/huggingface/pytorch/text-classification/deployment/emotion/distilbert_base_uncased/run_emotion.sh
+++ b/examples/huggingface/pytorch/text-classification/deployment/emotion/distilbert_base_uncased/run_emotion.sh
@@ -121,7 +121,7 @@ else
    echo "========== Prepare Model ${MODEL_NAME_OR_PATH} with Precision ${PRECISION} ========"
    mode_cmd=""
    if [[ ${PRECISION} = 'int8' ]]; then
-        mode_cmd=$mode_cmd" --tune --quantization_approach PostTrainingStatic"
+        mode_cmd=$mode_cmd" --tune --quantization_approach static"
    elif [[ ${PRECISION} = 'bf16' ]]; then
        mode_cmd=$mode_cmd" --enable_bf16"
    fi
diff --git a/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base/run_bert_base.sh b/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base/run_bert_base.sh
index f2e133f1d9e..d1648a0c661 100644
--- a/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base/run_bert_base.sh
+++ b/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base/run_bert_base.sh
@@ -123,7 +123,7 @@ else
     echo "========== Prepare Model ${MODEL_NAME_OR_PATH} with Precision ${PRECISION} ========"
     mode_cmd=""
     if [[ ${PRECISION} = 'int8' ]]; then
-        mode_cmd=$mode_cmd" --tune --quantization_approach PostTrainingStatic"
+        mode_cmd=$mode_cmd" --tune --quantization_approach static"
     elif [[ ${PRECISION} = 'bf16' ]]; then
         mode_cmd=$mode_cmd" --enable_bf16"
     fi
diff --git a/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base/run_glue.py b/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base/run_glue.py
index eb8e47583e0..7267b845f98 100644
--- a/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base/run_glue.py
+++ b/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base/run_glue.py
@@ -26,7 +26,13 @@ import transformers
 from dataclasses import dataclass, field
 from datasets import load_dataset, load_metric
-from intel_extension_for_transformers.transformers import metrics, objectives, OptimizedModel, QuantizationConfig
+from intel_extension_for_transformers.transformers import metrics, objectives, OptimizedModel
+from neural_compressor.config import (
+    PostTrainingQuantConfig,
+    QuantizationAwareTrainingConfig,
+    TuningCriterion,
+    AccuracyCriterion
+)
 from intel_extension_for_transformers.transformers.trainer import NLPTrainer
 from transformers import (
     AutoConfig,
@@ -197,9 +203,9 @@ class OptimizationArguments:
         metadata={"help": "Whether or not to apply quantization."},
     )
     quantization_approach: Optional[str] = field(
-        default="PostTrainingStatic",
-        metadata={"help": "Quantization approach. Supported approach are PostTrainingStatic, "
-                  "PostTrainingDynamic and QuantizationAwareTraining."},
+        default="static",
+        metadata={"help": "Quantization approach. Supported approach are static, "
+                  "dynamic and qat."},
     )
     metric_name: Optional[str] = field(
         default=None,
@@ -524,27 +530,37 @@ def compute_metrics(p: EvalPrediction):
         if not training_args.do_eval:
             raise ValueError("do_eval must be set to True for quantization.")
 
-        if optim_args.quantization_approach != "PostTrainingDynamic":
+        if optim_args.quantization_approach != "dynamic":
             if not training_args.do_train:
                 raise ValueError(
                     "do_train must be set to True for static and aware training quantization."
                 )
-        if optim_args.quantization_approach == "QuantizationAwareTraining":
-            early_stopping_patience = 6
-            early_stopping_threshold = 0.001 # optional
-            trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience,
-                                                                    early_stopping_threshold))
         tune_metric = metrics.Metric(
             name=metric_name, is_relative=optim_args.is_relative, criterion=optim_args.perf_tol
         )
-        objective = objectives.performance
-        quantization_config = QuantizationConfig(
-            approach=optim_args.quantization_approach,
-            max_trials=600,
-            metrics=[tune_metric],
-            objectives=[objective]
+        trainer.metrics = tune_metric
+        tuning_criterion = TuningCriterion(max_trials=600)
+        accuracy_criterion = AccuracyCriterion(
+            higher_is_better=False, # optional.
+            criterion="relative" if optim_args.is_relative else "absolute", # optional. Available values are "relative" and "absolute".
+            tolerable_loss=optim_args.perf_tol, # optional.
         )
+        if optim_args.quantization_approach != "qat":
+            quantization_config = PostTrainingQuantConfig(
+                approach=optim_args.quantization_approach,
+                tuning_criterion=tuning_criterion,
+                accuracy_criterion=accuracy_criterion
+            )
+        else:
+            quantization_config = QuantizationAwareTrainingConfig(
+                tuning_criterion=tuning_criterion,
+                accuracy_criterion=accuracy_criterion
+            )
+            early_stopping_patience = 2
+            early_stopping_threshold = 0.001 # optional
+            trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience, \
+                early_stopping_threshold))
         model = trainer.quantize(quant_config=quantization_config)
 
     if optim_args.benchmark or optim_args.accuracy_only:
diff --git a/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base_cased/run_bert_base_cased.sh b/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base_cased/run_bert_base_cased.sh
index 4daabbfe41a..cca1268e646 100644
--- a/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base_cased/run_bert_base_cased.sh
+++ b/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base_cased/run_bert_base_cased.sh
@@ -123,7 +123,7 @@ else
     echo "========== Prepare Model ${MODEL_NAME_OR_PATH} with Precision ${PRECISION} ========"
     mode_cmd=""
     if [[ ${PRECISION} = 'int8' ]]; then
-        mode_cmd=$mode_cmd" --tune --quantization_approach PostTrainingStatic"
+        mode_cmd=$mode_cmd" --tune --quantization_approach static"
     elif [[ ${PRECISION} = 'bf16' ]]; then
         mode_cmd=$mode_cmd" --enable_bf16"
     fi
diff --git a/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base_cased/run_glue.py b/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base_cased/run_glue.py
index 9374620302a..c68d3527abc 100644
--- a/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base_cased/run_glue.py
+++ b/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base_cased/run_glue.py
@@ -26,7 +26,13 @@ import transformers
 from dataclasses import dataclass, field
 from datasets import load_dataset, load_metric
-from intel_extension_for_transformers.transformers import metrics, objectives, OptimizedModel, QuantizationConfig
+from intel_extension_for_transformers.transformers import metrics, objectives, OptimizedModel
+from neural_compressor.config import (
+    PostTrainingQuantConfig,
+    QuantizationAwareTrainingConfig,
+    TuningCriterion,
+    AccuracyCriterion
+)
 from intel_extension_for_transformers.transformers.trainer import NLPTrainer
 from transformers import (
     AutoConfig,
@@ -197,9 +203,9 @@ class OptimizationArguments:
         metadata={"help": "Whether or not to apply quantization."},
     )
     quantization_approach: Optional[str] = field(
-        default="PostTrainingStatic",
-        metadata={"help": "Quantization approach. Supported approach are PostTrainingStatic, "
-                  "PostTrainingDynamic and QuantizationAwareTraining."},
+        default="static",
+        metadata={"help": "Quantization approach. Supported approach are static, "
+                  "dynamic and qat."},
     )
     metric_name: Optional[str] = field(
         default=None,
@@ -524,27 +530,37 @@ def compute_metrics(p: EvalPrediction):
         if not training_args.do_eval:
             raise ValueError("do_eval must be set to True for quantization.")
 
-        if optim_args.quantization_approach != "PostTrainingDynamic":
+        if optim_args.quantization_approach != "dynamic":
             if not training_args.do_train:
                 raise ValueError(
                     "do_train must be set to True for static and aware training quantization."
                )
-        if optim_args.quantization_approach == "QuantizationAwareTraining":
-            early_stopping_patience = 6
-            early_stopping_threshold = 0.001 # optional
-            trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience,
-                                                                    early_stopping_threshold))
         tune_metric = metrics.Metric(
             name=metric_name, is_relative=optim_args.is_relative, criterion=optim_args.perf_tol
         )
-        objective = objectives.performance
-        quantization_config = QuantizationConfig(
-            approach=optim_args.quantization_approach,
-            max_trials=600,
-            metrics=[tune_metric],
-            objectives=[objective]
+        trainer.metrics = tune_metric
+        tuning_criterion = TuningCriterion(max_trials=600)
+        accuracy_criterion = AccuracyCriterion(
+            higher_is_better=False, # optional.
+            criterion="relative" if optim_args.is_relative else "absolute", # optional. Available values are "relative" and "absolute".
+            tolerable_loss=optim_args.perf_tol, # optional.
         )
+        if optim_args.quantization_approach != "qat":
+            quantization_config = PostTrainingQuantConfig(
+                approach=optim_args.quantization_approach,
+                tuning_criterion=tuning_criterion,
+                accuracy_criterion=accuracy_criterion
+            )
+        else:
+            quantization_config = QuantizationAwareTrainingConfig(
+                tuning_criterion=tuning_criterion,
+                accuracy_criterion=accuracy_criterion
+            )
+            early_stopping_patience = 2
+            early_stopping_threshold = 0.001 # optional
+            trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience, \
+                early_stopping_threshold))
         model = trainer.quantize(quant_config=quantization_config)
 
     if optim_args.benchmark or optim_args.accuracy_only:
diff --git a/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_mini/run_bert_mini.sh b/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_mini/run_bert_mini.sh
index 6e9db50fddd..fa9f005fa54 100644
--- a/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_mini/run_bert_mini.sh
+++ b/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_mini/run_bert_mini.sh
@@ -123,7 +123,7 @@ else
     echo "========== Prepare Model ${MODEL_NAME_OR_PATH} with Precision ${PRECISION} ========"
     mode_cmd=""
     if [[ ${PRECISION} = 'int8' ]]; then
-        mode_cmd=$mode_cmd" --tune --quantization_approach PostTrainingStatic"
+        mode_cmd=$mode_cmd" --tune --quantization_approach static"
     elif [[ ${PRECISION} = 'bf16' ]]; then
         mode_cmd=$mode_cmd" --enable_bf16"
     fi
diff --git a/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_mini/run_glue.py b/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_mini/run_glue.py
index eb8e47583e0..6e3bc04cd0d 100644
--- a/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_mini/run_glue.py
+++ b/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_mini/run_glue.py
@@ -26,7 +26,13 @@ import transformers
 from dataclasses import dataclass, field
 from datasets import load_dataset, load_metric
-from intel_extension_for_transformers.transformers import metrics, objectives, OptimizedModel, QuantizationConfig
+from intel_extension_for_transformers.transformers import metrics, objectives, OptimizedModel
+from neural_compressor.config import (
+    PostTrainingQuantConfig,
+    QuantizationAwareTrainingConfig,
+    TuningCriterion,
+    AccuracyCriterion
+)
 from intel_extension_for_transformers.transformers.trainer import NLPTrainer
 from transformers import (
     AutoConfig,
@@ -197,9 +203,9 @@ class OptimizationArguments:
         metadata={"help": "Whether or not to apply quantization."},
     )
     quantization_approach: Optional[str] = field(
-        default="PostTrainingStatic",
-        metadata={"help": "Quantization approach. Supported approach are PostTrainingStatic, "
-                  "PostTrainingDynamic and QuantizationAwareTraining."},
+        default="static",
+        metadata={"help": "Quantization approach. Supported approach are static, "
+                  "dynamic and qat."},
     )
     metric_name: Optional[str] = field(
         default=None,
@@ -524,27 +530,36 @@ def compute_metrics(p: EvalPrediction):
         if not training_args.do_eval:
             raise ValueError("do_eval must be set to True for quantization.")
 
-        if optim_args.quantization_approach != "PostTrainingDynamic":
+        if optim_args.quantization_approach != "dynamic":
             if not training_args.do_train:
                 raise ValueError(
                     "do_train must be set to True for static and aware training quantization."
                 )
-        if optim_args.quantization_approach == "QuantizationAwareTraining":
-            early_stopping_patience = 6
-            early_stopping_threshold = 0.001 # optional
-            trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience,
-                                                                    early_stopping_threshold))
-
         tune_metric = metrics.Metric(
             name=metric_name, is_relative=optim_args.is_relative, criterion=optim_args.perf_tol
         )
-        objective = objectives.performance
-        quantization_config = QuantizationConfig(
-            approach=optim_args.quantization_approach,
-            max_trials=600,
-            metrics=[tune_metric],
-            objectives=[objective]
+        trainer.metrics = tune_metric
+        tuning_criterion = TuningCriterion(max_trials=600)
+        accuracy_criterion = AccuracyCriterion(
+            higher_is_better=False, # optional.
+            criterion="relative" if optim_args.is_relative else "absolute", # optional. Available values are "relative" and "absolute".
+            tolerable_loss=optim_args.perf_tol, # optional.
         )
+        if optim_args.quantization_approach != "qat":
+            quantization_config = PostTrainingQuantConfig(
+                approach=optim_args.quantization_approach,
+                tuning_criterion=tuning_criterion,
+                accuracy_criterion=accuracy_criterion
+            )
+        else:
+            quantization_config = QuantizationAwareTrainingConfig(
+                tuning_criterion=tuning_criterion,
+                accuracy_criterion=accuracy_criterion
+            )
+            early_stopping_patience = 2
+            early_stopping_threshold = 0.001 # optional
+            trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience, \
+                early_stopping_threshold))
         model = trainer.quantize(quant_config=quantization_config)
 
     if optim_args.benchmark or optim_args.accuracy_only:
diff --git a/examples/huggingface/pytorch/text-classification/deployment/mrpc/distilbert_base_uncased/run_distilbert_base.sh b/examples/huggingface/pytorch/text-classification/deployment/mrpc/distilbert_base_uncased/run_distilbert_base.sh
index 8c75385ea5b..b20cffed3b5 100644
--- a/examples/huggingface/pytorch/text-classification/deployment/mrpc/distilbert_base_uncased/run_distilbert_base.sh
+++ b/examples/huggingface/pytorch/text-classification/deployment/mrpc/distilbert_base_uncased/run_distilbert_base.sh
@@ -123,7 +123,7 @@ else
     echo "========== Prepare Model ${MODEL_NAME_OR_PATH} with Precision ${PRECISION} ========"
     mode_cmd=""
     if [[ ${PRECISION} = 'int8' ]]; then
-        mode_cmd=$mode_cmd" --tune --quantization_approach PostTrainingStatic"
+        mode_cmd=$mode_cmd" --tune --quantization_approach static"
     elif [[ ${PRECISION} = 'bf16' ]]; then
         mode_cmd=$mode_cmd" --enable_bf16"
     fi
diff --git a/examples/huggingface/pytorch/text-classification/deployment/mrpc/distilbert_base_uncased/run_glue.py b/examples/huggingface/pytorch/text-classification/deployment/mrpc/distilbert_base_uncased/run_glue.py
index eb8e47583e0..6e3bc04cd0d 100644
--- a/examples/huggingface/pytorch/text-classification/deployment/mrpc/distilbert_base_uncased/run_glue.py
+++ b/examples/huggingface/pytorch/text-classification/deployment/mrpc/distilbert_base_uncased/run_glue.py
@@ -26,7 +26,13 @@ import transformers
 from dataclasses import dataclass, field
 from datasets import load_dataset, load_metric
-from intel_extension_for_transformers.transformers import metrics, objectives, OptimizedModel, QuantizationConfig
+from intel_extension_for_transformers.transformers import metrics, objectives, OptimizedModel
+from neural_compressor.config import (
+    PostTrainingQuantConfig,
+    QuantizationAwareTrainingConfig,
+    TuningCriterion,
+    AccuracyCriterion
+)
 from intel_extension_for_transformers.transformers.trainer import NLPTrainer
 from transformers import (
     AutoConfig,
@@ -197,9 +203,9 @@ class OptimizationArguments:
         metadata={"help": "Whether or not to apply quantization."},
     )
     quantization_approach: Optional[str] = field(
-        default="PostTrainingStatic",
-        metadata={"help": "Quantization approach. Supported approach are PostTrainingStatic, "
-                  "PostTrainingDynamic and QuantizationAwareTraining."},
+        default="static",
+        metadata={"help": "Quantization approach. Supported approach are static, "
+                  "dynamic and qat."},
     )
     metric_name: Optional[str] = field(
         default=None,
@@ -524,27 +530,36 @@ def compute_metrics(p: EvalPrediction):
         if not training_args.do_eval:
             raise ValueError("do_eval must be set to True for quantization.")
 
-        if optim_args.quantization_approach != "PostTrainingDynamic":
+        if optim_args.quantization_approach != "dynamic":
             if not training_args.do_train:
                 raise ValueError(
                     "do_train must be set to True for static and aware training quantization."
                 )
-        if optim_args.quantization_approach == "QuantizationAwareTraining":
-            early_stopping_patience = 6
-            early_stopping_threshold = 0.001 # optional
-            trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience,
-                                                                    early_stopping_threshold))
-
         tune_metric = metrics.Metric(
             name=metric_name, is_relative=optim_args.is_relative, criterion=optim_args.perf_tol
         )
-        objective = objectives.performance
-        quantization_config = QuantizationConfig(
-            approach=optim_args.quantization_approach,
-            max_trials=600,
-            metrics=[tune_metric],
-            objectives=[objective]
+        trainer.metrics = tune_metric
+        tuning_criterion = TuningCriterion(max_trials=600)
+        accuracy_criterion = AccuracyCriterion(
+            higher_is_better=False, # optional.
+            criterion="relative" if optim_args.is_relative else "absolute", # optional. Available values are "relative" and "absolute".
+            tolerable_loss=optim_args.perf_tol, # optional.
         )
+        if optim_args.quantization_approach != "qat":
+            quantization_config = PostTrainingQuantConfig(
+                approach=optim_args.quantization_approach,
+                tuning_criterion=tuning_criterion,
+                accuracy_criterion=accuracy_criterion
+            )
+        else:
+            quantization_config = QuantizationAwareTrainingConfig(
+                tuning_criterion=tuning_criterion,
+                accuracy_criterion=accuracy_criterion
+            )
+            early_stopping_patience = 2
+            early_stopping_threshold = 0.001 # optional
+            trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience, \
+                early_stopping_threshold))
         model = trainer.quantize(quant_config=quantization_config)
 
     if optim_args.benchmark or optim_args.accuracy_only:
diff --git a/examples/huggingface/pytorch/text-classification/deployment/mrpc/roberta_base/run_glue.py b/examples/huggingface/pytorch/text-classification/deployment/mrpc/roberta_base/run_glue.py
index efc762b5c59..6f899f98ed4 100644
--- a/examples/huggingface/pytorch/text-classification/deployment/mrpc/roberta_base/run_glue.py
+++ b/examples/huggingface/pytorch/text-classification/deployment/mrpc/roberta_base/run_glue.py
@@ -26,7 +26,13 @@ import transformers
 from dataclasses import dataclass, field
 from datasets import load_dataset, load_metric
-from intel_extension_for_transformers.transformers import metrics, objectives, OptimizedModel, QuantizationConfig
+from intel_extension_for_transformers.transformers import metrics, objectives, OptimizedModel
+from neural_compressor.config import (
+    PostTrainingQuantConfig,
+    QuantizationAwareTrainingConfig,
+    TuningCriterion,
+    AccuracyCriterion
+)
 from intel_extension_for_transformers.transformers.trainer import NLPTrainer
 from transformers import (
     AutoConfig,
@@ -197,9 +203,9 @@ class OptimizationArguments:
         metadata={"help": "Whether or not to apply quantization."},
     )
     quantization_approach: Optional[str] = field(
-        default="PostTrainingStatic",
-        metadata={"help": "Quantization approach. Supported approach are PostTrainingStatic, "
-                  "PostTrainingDynamic and QuantizationAwareTraining."},
+        default="static",
+        metadata={"help": "Quantization approach. Supported approach are static, "
+                  "dynamic and qat."},
     )
     metric_name: Optional[str] = field(
         default=None,
@@ -524,27 +530,36 @@ def compute_metrics(p: EvalPrediction):
         if not training_args.do_eval:
             raise ValueError("do_eval must be set to True for quantization.")
 
-        if optim_args.quantization_approach != "PostTrainingDynamic":
+        if optim_args.quantization_approach != "dynamic":
             if not training_args.do_train:
                 raise ValueError(
                     "do_train must be set to True for static and aware training quantization."
                 )
-        if optim_args.quantization_approach == "QuantizationAwareTraining":
-            early_stopping_patience = 6
-            early_stopping_threshold = 0.001 # optional
-            trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience,
-                                                                    early_stopping_threshold))
-
         tune_metric = metrics.Metric(
             name=metric_name, is_relative=optim_args.is_relative, criterion=optim_args.perf_tol
        )
-        objective = objectives.performance
-        quantization_config = QuantizationConfig(
-            approach=optim_args.quantization_approach,
-            max_trials=600,
-            metrics=[tune_metric],
-            objectives=[objective]
+        trainer.metrics = tune_metric
+        tuning_criterion = TuningCriterion(max_trials=600)
+        accuracy_criterion = AccuracyCriterion(
+            higher_is_better=False, # optional.
+            criterion="relative" if optim_args.is_relative else "absolute", # optional. Available values are "relative" and "absolute".
+            tolerable_loss=optim_args.perf_tol, # optional.
) + if optim_args.quantization_approach != "qat": + quantization_config = PostTrainingQuantConfig( + approach=optim_args.quantization_approach, + tuning_criterion=tuning_criterion, + accuracy_criterion=accuracy_criterion + ) + else: + quantization_config = QuantizationAwareTrainingConfig( + tuning_criterion=tuning_criterion, + accuracy_criterion=accuracy_criterion + ) + early_stopping_patience = 2 + early_stopping_threshold = 0.001 # optional + trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience, \ + early_stopping_threshold)) model = trainer.quantize(quant_config=quantization_config) if optim_args.benchmark or optim_args.accuracy_only: diff --git a/examples/huggingface/pytorch/text-classification/deployment/mrpc/roberta_base/run_roberta_base.sh b/examples/huggingface/pytorch/text-classification/deployment/mrpc/roberta_base/run_roberta_base.sh index 37110ee356c..4228e299003 100644 --- a/examples/huggingface/pytorch/text-classification/deployment/mrpc/roberta_base/run_roberta_base.sh +++ b/examples/huggingface/pytorch/text-classification/deployment/mrpc/roberta_base/run_roberta_base.sh @@ -123,7 +123,7 @@ else echo "========== Prepare Model ${MODEL_NAME_OR_PATH} with Precision ${PRECISION} ========" mode_cmd="" if [[ ${PRECISION} = 'int8' ]]; then - mode_cmd=$mode_cmd" --tune --quantization_approach PostTrainingStatic" + mode_cmd=$mode_cmd" --tune --quantization_approach static" elif [[ ${PRECISION} = 'bf16' ]]; then mode_cmd=$mode_cmd" --enable_bf16" fi diff --git a/examples/huggingface/pytorch/text-classification/deployment/sst2/bert_mini/run_bert_mini.sh b/examples/huggingface/pytorch/text-classification/deployment/sst2/bert_mini/run_bert_mini.sh index 4a5a986fdc1..d756a945cf6 100644 --- a/examples/huggingface/pytorch/text-classification/deployment/sst2/bert_mini/run_bert_mini.sh +++ b/examples/huggingface/pytorch/text-classification/deployment/sst2/bert_mini/run_bert_mini.sh @@ -123,7 +123,7 @@ else echo "========== Prepare Model ${MODEL_NAME_OR_PATH} with Precision ${PRECISION} ========" mode_cmd="" if [[ ${PRECISION} = 'int8' ]]; then - mode_cmd=$mode_cmd" --tune --quantization_approach PostTrainingStatic" + mode_cmd=$mode_cmd" --tune --quantization_approach static" elif [[ ${PRECISION} = 'bf16' ]]; then mode_cmd=$mode_cmd" --enable_bf16" fi diff --git a/examples/huggingface/pytorch/text-classification/deployment/sst2/bert_mini/run_glue.py b/examples/huggingface/pytorch/text-classification/deployment/sst2/bert_mini/run_glue.py index efc762b5c59..6f899f98ed4 100644 --- a/examples/huggingface/pytorch/text-classification/deployment/sst2/bert_mini/run_glue.py +++ b/examples/huggingface/pytorch/text-classification/deployment/sst2/bert_mini/run_glue.py @@ -26,7 +26,13 @@ import transformers from dataclasses import dataclass, field from datasets import load_dataset, load_metric -from intel_extension_for_transformers.transformers import metrics, objectives, OptimizedModel, QuantizationConfig +from intel_extension_for_transformers.transformers import metrics, objectives, OptimizedModel +from neural_compressor.config import ( + PostTrainingQuantConfig, + QuantizationAwareTrainingConfig, + TuningCriterion, + AccuracyCriterion +) from intel_extension_for_transformers.transformers.trainer import NLPTrainer from transformers import ( AutoConfig, @@ -197,9 +203,9 @@ class OptimizationArguments: metadata={"help": "Whether or not to apply quantization."}, ) quantization_approach: Optional[str] = field( - default="PostTrainingStatic", - metadata={"help": "Quantization 
-        metadata={"help": "Quantization approach. Supported approach are PostTrainingStatic, "
-                  "PostTrainingDynamic and QuantizationAwareTraining."},
+        default="static",
+        metadata={"help": "Quantization approach. Supported approach are static, "
+                  "dynamic and qat."},
     )
     metric_name: Optional[str] = field(
         default=None,
@@ -524,27 +530,36 @@ def compute_metrics(p: EvalPrediction):
         if not training_args.do_eval:
             raise ValueError("do_eval must be set to True for quantization.")
 
-        if optim_args.quantization_approach != "PostTrainingDynamic":
+        if optim_args.quantization_approach != "dynamic":
             if not training_args.do_train:
                 raise ValueError(
                     "do_train must be set to True for static and aware training quantization."
                 )
-        if optim_args.quantization_approach == "QuantizationAwareTraining":
-            early_stopping_patience = 6
-            early_stopping_threshold = 0.001 # optional
-            trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience,
-                                                                    early_stopping_threshold))
-
         tune_metric = metrics.Metric(
             name=metric_name, is_relative=optim_args.is_relative, criterion=optim_args.perf_tol
         )
-        objective = objectives.performance
-        quantization_config = QuantizationConfig(
-            approach=optim_args.quantization_approach,
-            max_trials=600,
-            metrics=[tune_metric],
-            objectives=[objective]
+        trainer.metrics = tune_metric
+        tuning_criterion = TuningCriterion(max_trials=600)
+        accuracy_criterion = AccuracyCriterion(
+            higher_is_better=False, # optional.
+            criterion="relative" if optim_args.is_relative else "absolute", # optional. Available values are "relative" and "absolute".
+            tolerable_loss=optim_args.perf_tol, # optional.
         )
+        if optim_args.quantization_approach != "qat":
+            quantization_config = PostTrainingQuantConfig(
+                approach=optim_args.quantization_approach,
+                tuning_criterion=tuning_criterion,
+                accuracy_criterion=accuracy_criterion
+            )
+        else:
+            quantization_config = QuantizationAwareTrainingConfig(
+                tuning_criterion=tuning_criterion,
+                accuracy_criterion=accuracy_criterion
+            )
+            early_stopping_patience = 2
+            early_stopping_threshold = 0.001 # optional
+            trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience, \
+                                                                    early_stopping_threshold))
         model = trainer.quantize(quant_config=quantization_config)
 
     if optim_args.benchmark or optim_args.accuracy_only:
diff --git a/examples/huggingface/pytorch/text-classification/deployment/sst2/distilbert_base_uncased/run_distilbert_base.sh b/examples/huggingface/pytorch/text-classification/deployment/sst2/distilbert_base_uncased/run_distilbert_base.sh
index d2be2d43c50..962a0044ca0 100644
--- a/examples/huggingface/pytorch/text-classification/deployment/sst2/distilbert_base_uncased/run_distilbert_base.sh
+++ b/examples/huggingface/pytorch/text-classification/deployment/sst2/distilbert_base_uncased/run_distilbert_base.sh
@@ -123,7 +123,7 @@ else
     echo "========== Prepare Model ${MODEL_NAME_OR_PATH} with Precision ${PRECISION} ========"
     mode_cmd=""
     if [[ ${PRECISION} = 'int8' ]]; then
-        mode_cmd=$mode_cmd" --tune --quantization_approach PostTrainingStatic"
+        mode_cmd=$mode_cmd" --tune --quantization_approach static"
     elif [[ ${PRECISION} = 'bf16' ]]; then
         mode_cmd=$mode_cmd" --enable_bf16"
     fi
diff --git a/examples/huggingface/pytorch/text-classification/deployment/sst2/distilbert_base_uncased/run_glue.py b/examples/huggingface/pytorch/text-classification/deployment/sst2/distilbert_base_uncased/run_glue.py
index eb8e47583e0..6e3bc04cd0d 100644
--- a/examples/huggingface/pytorch/text-classification/deployment/sst2/distilbert_base_uncased/run_glue.py
+++ b/examples/huggingface/pytorch/text-classification/deployment/sst2/distilbert_base_uncased/run_glue.py
@@ -26,7 +26,13 @@
 import transformers
 from dataclasses import dataclass, field
 from datasets import load_dataset, load_metric
-from intel_extension_for_transformers.transformers import metrics, objectives, OptimizedModel, QuantizationConfig
+from intel_extension_for_transformers.transformers import metrics, objectives, OptimizedModel
+from neural_compressor.config import (
+    PostTrainingQuantConfig,
+    QuantizationAwareTrainingConfig,
+    TuningCriterion,
+    AccuracyCriterion
+)
 from intel_extension_for_transformers.transformers.trainer import NLPTrainer
 from transformers import (
     AutoConfig,
@@ -197,9 +203,9 @@ class OptimizationArguments:
         metadata={"help": "Whether or not to apply quantization."},
     )
     quantization_approach: Optional[str] = field(
-        default="PostTrainingStatic",
-        metadata={"help": "Quantization approach. Supported approach are PostTrainingStatic, "
-                  "PostTrainingDynamic and QuantizationAwareTraining."},
+        default="static",
+        metadata={"help": "Quantization approach. Supported approach are static, "
+                  "dynamic and qat."},
     )
     metric_name: Optional[str] = field(
         default=None,
@@ -524,27 +530,36 @@ def compute_metrics(p: EvalPrediction):
         if not training_args.do_eval:
             raise ValueError("do_eval must be set to True for quantization.")
 
-        if optim_args.quantization_approach != "PostTrainingDynamic":
+        if optim_args.quantization_approach != "dynamic":
             if not training_args.do_train:
                 raise ValueError(
                     "do_train must be set to True for static and aware training quantization."
                 )
-        if optim_args.quantization_approach == "QuantizationAwareTraining":
-            early_stopping_patience = 6
-            early_stopping_threshold = 0.001 # optional
-            trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience,
-                                                                    early_stopping_threshold))
-
         tune_metric = metrics.Metric(
             name=metric_name, is_relative=optim_args.is_relative, criterion=optim_args.perf_tol
         )
-        objective = objectives.performance
-        quantization_config = QuantizationConfig(
-            approach=optim_args.quantization_approach,
-            max_trials=600,
-            metrics=[tune_metric],
-            objectives=[objective]
+        trainer.metrics = tune_metric
+        tuning_criterion = TuningCriterion(max_trials=600)
+        accuracy_criterion = AccuracyCriterion(
+            higher_is_better=False, # optional.
+            criterion="relative" if optim_args.is_relative else "absolute", # optional. Available values are "relative" and "absolute".
+            tolerable_loss=optim_args.perf_tol, # optional.
         )
+        if optim_args.quantization_approach != "qat":
+            quantization_config = PostTrainingQuantConfig(
+                approach=optim_args.quantization_approach,
+                tuning_criterion=tuning_criterion,
+                accuracy_criterion=accuracy_criterion
+            )
+        else:
+            quantization_config = QuantizationAwareTrainingConfig(
+                tuning_criterion=tuning_criterion,
+                accuracy_criterion=accuracy_criterion
+            )
+            early_stopping_patience = 2
+            early_stopping_threshold = 0.001 # optional
+            trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience, \
+                                                                    early_stopping_threshold))
         model = trainer.quantize(quant_config=quantization_config)
 
     if optim_args.benchmark or optim_args.accuracy_only:
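A note on the AccuracyCriterion arguments repeated in each hunk above: `criterion` selects how `tolerable_loss` is compared against the FP32 baseline metric. A small illustration, with a hypothetical baseline accuracy of 0.90 (the numbers are for exposition only and do not appear in this patch):

    from neural_compressor.config import AccuracyCriterion

    # "relative": accept a candidate whose metric is >= 0.90 * (1 - 0.01) = 0.891
    relative = AccuracyCriterion(criterion="relative", tolerable_loss=0.01)

    # "absolute": accept a candidate whose metric is >= 0.90 - 0.01 = 0.89
    absolute = AccuracyCriterion(criterion="absolute", tolerable_loss=0.01)

The examples here additionally pass higher_is_better=False, which signals a metric that improves downward (loss-style) and flips the direction of both comparisons.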
         )
+        if optim_args.quantization_approach != "qat":
+            quantization_config = PostTrainingQuantConfig(
+                approach=optim_args.quantization_approach,
+                tuning_criterion=tuning_criterion,
+                accuracy_criterion=accuracy_criterion
+            )
+        else:
+            quantization_config = QuantizationAwareTrainingConfig(
+                tuning_criterion=tuning_criterion,
+                accuracy_criterion=accuracy_criterion
+            )
+            early_stopping_patience = 2
+            early_stopping_threshold = 0.001 # optional
+            trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience, \
+                                                                    early_stopping_threshold))
         model = trainer.quantize(quant_config=quantization_config)
 
     if optim_args.benchmark or optim_args.accuracy_only:
diff --git a/examples/huggingface/pytorch/text-classification/deployment/sst2/minilm_l6_h384_uncased/run_glue.py b/examples/huggingface/pytorch/text-classification/deployment/sst2/minilm_l6_h384_uncased/run_glue.py
index 4400593f0c2..9eaa0cb0ecd 100644
--- a/examples/huggingface/pytorch/text-classification/deployment/sst2/minilm_l6_h384_uncased/run_glue.py
+++ b/examples/huggingface/pytorch/text-classification/deployment/sst2/minilm_l6_h384_uncased/run_glue.py
@@ -26,7 +26,13 @@
 import transformers
 from dataclasses import dataclass, field
 from datasets import load_dataset, load_metric
-from intel_extension_for_transformers.transformers import metrics, objectives, OptimizedModel, QuantizationConfig
+from intel_extension_for_transformers.transformers import metrics, objectives, OptimizedModel
+from neural_compressor.config import (
+    PostTrainingQuantConfig,
+    QuantizationAwareTrainingConfig,
+    TuningCriterion,
+    AccuracyCriterion
+)
 from intel_extension_for_transformers.transformers.trainer import NLPTrainer
 from transformers import (
     AutoConfig,
@@ -197,9 +203,9 @@ class OptimizationArguments:
         metadata={"help": "Whether or not to apply quantization."},
     )
     quantization_approach: Optional[str] = field(
-        default="PostTrainingStatic",
-        metadata={"help": "Quantization approach. Supported approach are PostTrainingStatic, "
-                  "PostTrainingDynamic and QuantizationAwareTraining."},
+        default="static",
+        metadata={"help": "Quantization approach. Supported approach are static, "
+                  "dynamic and qat."},
     )
     metric_name: Optional[str] = field(
         default=None,
@@ -524,27 +530,37 @@ def compute_metrics(p: EvalPrediction):
         if not training_args.do_eval:
             raise ValueError("do_eval must be set to True for quantization.")
 
-        if optim_args.quantization_approach != "PostTrainingDynamic":
+        if optim_args.quantization_approach != "dynamic":
             if not training_args.do_train:
                 raise ValueError(
                     "do_train must be set to True for static and aware training quantization."
                 )
-        if optim_args.quantization_approach == "QuantizationAwareTraining":
-            early_stopping_patience = 6
-            early_stopping_threshold = 0.001 # optional
-            trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience,
-                                                                    early_stopping_threshold))
 
         tune_metric = metrics.Metric(
             name=metric_name, is_relative=optim_args.is_relative, criterion=optim_args.perf_tol
         )
-        objective = objectives.performance
-        quantization_config = QuantizationConfig(
-            approach=optim_args.quantization_approach,
-            max_trials=600,
-            metrics=[tune_metric],
-            objectives=[objective]
+        trainer.metrics = tune_metric
+        tuning_criterion = TuningCriterion(max_trials=600)
+        accuracy_criterion = AccuracyCriterion(
+            higher_is_better=False, # optional.
+            criterion="relative" if optim_args.is_relative else "absolute", # optional. Available values are "relative" and "absolute".
+            tolerable_loss=optim_args.perf_tol, # optional.
         )
+        if optim_args.quantization_approach != "qat":
+            quantization_config = PostTrainingQuantConfig(
+                approach=optim_args.quantization_approach,
+                tuning_criterion=tuning_criterion,
+                accuracy_criterion=accuracy_criterion
+            )
+        else:
+            quantization_config = QuantizationAwareTrainingConfig(
+                tuning_criterion=tuning_criterion,
+                accuracy_criterion=accuracy_criterion
+            )
+            early_stopping_patience = 2
+            early_stopping_threshold = 0.001 # optional
+            trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience, \
+                                                                    early_stopping_threshold))
         model = trainer.quantize(quant_config=quantization_config)
 
     if optim_args.benchmark or optim_args.accuracy_only:
diff --git a/examples/huggingface/pytorch/text-classification/deployment/sst2/minilm_l6_h384_uncased/run_minilm.sh b/examples/huggingface/pytorch/text-classification/deployment/sst2/minilm_l6_h384_uncased/run_minilm.sh
index 20417a96738..31f6033ce6e 100644
--- a/examples/huggingface/pytorch/text-classification/deployment/sst2/minilm_l6_h384_uncased/run_minilm.sh
+++ b/examples/huggingface/pytorch/text-classification/deployment/sst2/minilm_l6_h384_uncased/run_minilm.sh
@@ -123,7 +123,7 @@ else
     echo "========== Prepare Model ${MODEL_NAME_OR_PATH} with Precision ${PRECISION} ========"
     mode_cmd=""
     if [[ ${PRECISION} = 'int8' ]]; then
-        mode_cmd=$mode_cmd" --tune --quantization_approach PostTrainingStatic"
+        mode_cmd=$mode_cmd" --tune --quantization_approach static"
     elif [[ ${PRECISION} = 'bf16' ]]; then
         mode_cmd=$mode_cmd" --enable_bf16"
     fi
diff --git a/examples/huggingface/pytorch/text-embedding/deployment/mteb/bge/run_bge.sh b/examples/huggingface/pytorch/text-embedding/deployment/mteb/bge/run_bge.sh
index 8c24dbcffef..5b62734d906 100644
--- a/examples/huggingface/pytorch/text-embedding/deployment/mteb/bge/run_bge.sh
+++ b/examples/huggingface/pytorch/text-embedding/deployment/mteb/bge/run_bge.sh
@@ -123,7 +123,7 @@ else
     echo "========== Prepare Model ${MODEL_NAME_OR_PATH} with Precision ${PRECISION} ========"
     mode_cmd=""
     if [[ ${PRECISION} = 'int8' ]]; then
-        mode_cmd=$mode_cmd" --tune --quantization_approach PostTrainingStatic"
+        mode_cmd=$mode_cmd" --tune --quantization_approach static"
     elif [[ ${PRECISION} = 'bf16' ]]; then
         mode_cmd=$mode_cmd" --enable_bf16"
     fi
diff --git a/examples/huggingface/pytorch/text-embedding/deployment/mteb/bge/run_mteb.py b/examples/huggingface/pytorch/text-embedding/deployment/mteb/bge/run_mteb.py
index 557524c6590..225c8012b34 100644
--- a/examples/huggingface/pytorch/text-embedding/deployment/mteb/bge/run_mteb.py
+++ b/examples/huggingface/pytorch/text-embedding/deployment/mteb/bge/run_mteb.py
@@ -28,7 +28,13 @@
 import transformers
 from dataclasses import dataclass, field
 from datasets import load_dataset, load_metric
-from intel_extension_for_transformers.transformers import metrics, objectives, OptimizedModel, QuantizationConfig
+from intel_extension_for_transformers.transformers import metrics, objectives, OptimizedModel
+from neural_compressor.config import (
+    PostTrainingQuantConfig,
+    QuantizationAwareTrainingConfig,
+    TuningCriterion,
+    AccuracyCriterion
+)
 from intel_extension_for_transformers.transformers.trainer import NLPTrainer
 
 from transformers import (
@@ -201,9 +207,9 @@ class OptimizationArguments:
         metadata={"help": "Whether or not to apply quantization."},
     )
     quantization_approach: Optional[str] = field(
-        default="PostTrainingStatic",
-        metadata={"help": "Quantization approach. Supported approach are PostTrainingStatic, "
-                  "PostTrainingDynamic and QuantizationAwareTraining."},
+        default="static",
+        metadata={"help": "Quantization approach. Supported approach are static, "
+                  "dynamic and qat."},
     )
     metric_name: Optional[str] = field(
         default=None,
@@ -571,28 +577,36 @@ def preprocess_function(example):
         if not training_args.do_eval:
             raise ValueError("do_eval must be set to True for quantization.")
 
-        if optim_args.quantization_approach != "PostTrainingDynamic":
+        if optim_args.quantization_approach != "dynamic":
             if not training_args.do_train:
                 raise ValueError(
                     "do_train must be set to True for static and aware training quantization."
                 )
-        if optim_args.quantization_approach == "QuantizationAwareTraining":
-            early_stopping_patience = 6
-            early_stopping_threshold = 0.001 # optional
-            trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience,
-                                                                    early_stopping_threshold))
-
         tune_metric = metrics.Metric(
             name=metric_name, is_relative=optim_args.is_relative, criterion=optim_args.perf_tol
         )
-        objective = objectives.performance
-        quantization_config = QuantizationConfig(
-            approach=optim_args.quantization_approach,
-            max_trials=600,
-            metrics=[tune_metric],
-            objectives=[objective],
-            sampling_size = len(train_dataset)//20
+        trainer.metrics = tune_metric
+        tuning_criterion = TuningCriterion(max_trials=600)
+        accuracy_criterion = AccuracyCriterion(
+            higher_is_better=False, # optional.
+            criterion="relative" if optim_args.is_relative else "absolute", # optional. Available values are "relative" and "absolute".
+            tolerable_loss=optim_args.perf_tol, # optional.
         )
+        if optim_args.quantization_approach != "qat":
+            quantization_config = PostTrainingQuantConfig(
+                approach=optim_args.quantization_approach,
+                tuning_criterion=tuning_criterion,
+                accuracy_criterion=accuracy_criterion
+            )
+        else:
+            quantization_config = QuantizationAwareTrainingConfig(
+                tuning_criterion=tuning_criterion,
+                accuracy_criterion=accuracy_criterion
+            )
+            early_stopping_patience = 2
+            early_stopping_threshold = 0.001 # optional
+            trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience, \
+                                                                    early_stopping_threshold))
         stmodel = SentenceTransformer(model_args.model_name_or_path)
 
         def eval_func(model):
diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh b/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh
index 16eaaa3182e..7c3919a132a 100644
--- a/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh
+++ b/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh
@@ -16,7 +16,7 @@ function init_params {
   model_name_or_path="EleutherAI/gpt-j-6b"
   extra_cmd=""
   batch_size=8
-  approach="PostTrainingStatic"
+  approach="static"
   script="run_generation_sq.py"
   alpha=0.5
   weight_dtype="int4"
diff --git a/examples/huggingface/pytorch/text2text-generation/run_tuning.sh b/examples/huggingface/pytorch/text2text-generation/run_tuning.sh
index 3d35086f578..826469b1b3c 100644
--- a/examples/huggingface/pytorch/text2text-generation/run_tuning.sh
+++ b/examples/huggingface/pytorch/text2text-generation/run_tuning.sh
@@ -16,7 +16,7 @@ function init_params {
   model_name_or_path="google/flan-t5-large"
   extra_cmd=""
   batch_size=8
-  approach="PostTrainingStatic"
+  approach="static"
   alpha=0.7
   for var in "$@"
   do
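Unlike the old QuantizationConfig, PostTrainingQuantConfig no longer carries a sampling_size or an objectives list; in the MTEB example above the evaluation loop is instead supplied as an eval_func to trainer.quantize, the same mechanism the updated unit tests below exercise. A minimal sketch of that pattern, assuming `trainer` is an NLPTrainer and `score_model` is a hypothetical helper returning a single scalar metric:

    from neural_compressor.config import PostTrainingQuantConfig

    def eval_func(model):
        # Return one scalar; the tuner compares it against the FP32 baseline.
        return score_model(model)  # hypothetical helper

    def train_func(model):
        # Only consulted for QAT-style flows; return the (re)trained model.
        return model

    quant_config = PostTrainingQuantConfig(approach="static")
    quantized = trainer.quantize(
        quant_config=quant_config,
        provider="inc",
        train_func=train_func,
        eval_func=eval_func,
    )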
diff --git a/tests/CI/test_quantization.py b/tests/CI/test_quantization.py
index e3c2799e94f..dce618d7845 100644
--- a/tests/CI/test_quantization.py
+++ b/tests/CI/test_quantization.py
@@ -23,12 +23,15 @@
 import unittest
 from intel_extension_for_transformers.transformers import (
     metrics,
-    objectives,
     OptimizedModel,
-    QuantizationConfig,
-    QuantizationMode,
     NoTrainerOptimizer,
 )
+from neural_compressor.config import (
+    PostTrainingQuantConfig,
+    QuantizationAwareTrainingConfig,
+    TuningCriterion,
+    AccuracyCriterion
+)
 from intel_extension_for_transformers.transformers.trainer import NLPTrainer
 from intel_extension_for_transformers.transformers.trainer import NLPSeq2SeqTrainer
 from intel_extension_for_transformers.transformers.utils import CpuInfo
@@ -107,116 +110,44 @@ def tearDownClass(self):
 
     def test_fx_model_quant(self):
         fp32_output = self.trainer.predict(self.dummy_dataset).predictions
-        for mode in QuantizationMode:
-            print("Quantization approach:", mode.value)
-            self.trainer = NLPTrainer(
-                model=self.model,
-                train_dataset=self.dummy_dataset,
-                eval_dataset=self.dummy_dataset,
-            )
-
-            # Check fp32 jit and onnx model, only once.
-            if mode == QuantizationMode.POSTTRAININGSTATIC:
-                jit_model = self.trainer.export_to_jit()
-                self.trainer.export_to_onnx('fp32-model.onnx')
-                self.assertTrue(check_onnx('fp32-model.onnx', self.trainer.get_eval_dataloader()))
-
-            self.trainer.benchmark(num_of_instance=1)
-            tune_metric = metrics.Metric(
-                name="eval_loss", greater_is_better=False, is_relative=False, criterion=0.5
-            )
-            quantization_config = QuantizationConfig(
-                approach=mode.name,
-                metrics=[tune_metric],
-                objectives=[objectives.performance]
-            )
-            quantized_model = self.trainer.quantize(quant_config=quantization_config, provider="inc")
-            self.trainer.benchmark(self.trainer.args.output_dir, num_of_instance=1)
-            # By default, model will be saved into tmp_trainer dir.
-            self.trainer.save_model('./quantized_model')
-
-            # Check int8 onnx model
-            if mode == QuantizationMode.POSTTRAININGSTATIC:
-                # test different configure to improve UT coverage
-                self.trainer.export_to_onnx(
-                    save_path=None,
-                    quant_format='Qlinear',
-                    dtype='S8S8',
-                    opset_version=13,
-                )
-                self.assertTrue(check_onnx('./tmp_trainer/int8-model.onnx', self.trainer.get_eval_dataloader()))
-            else:
-                self.trainer.export_to_onnx('int8-model.onnx')
-                self.assertTrue(check_onnx('int8-model.onnx', self.trainer.get_eval_dataloader()))
-
-            if mode == QuantizationMode.QUANTIZATIONAWARETRAINING:
-                model = onnx.load('int8-model.onnx')
-                tensor_list = {tensor.name:tensor for tensor in model.graph.initializer}
-                torch_data = quantized_model.classifier.state_dict()\
-                    ['module._packed_params._packed_params'][0].\
-                    dequantize().detach().cpu().numpy().T
-                from onnx.numpy_helper import to_array
-                onnx_data = to_array(tensor_list['classifier.weight_quantized'])
-                onnx_scale = to_array(tensor_list['classifier.weight_scale'])
-                self.assertTrue(np.allclose(torch_data, onnx_data * onnx_scale, atol=0.001))
-            # Check quantized model
-            output_1 = self.trainer.predict(self.dummy_dataset).predictions
-            loaded_model = OptimizedModel.from_pretrained(
-                './quantized_model',
-            )
-            self.trainer.model = loaded_model
-            output_2 = self.trainer.predict(self.dummy_dataset).predictions
-            self.assertTrue((fp32_output != output_1).any())
-
-            # check loaded model
-            self.assertTrue((output_1 == output_2).all())
-
-    def test_fx_model_with_smooth_quant(self):
-        def eval_func(model):
-            return 1
-
-        def train_func(model):
-            return model
-
-        trainer = NLPTrainer(
+        self.trainer = NLPTrainer(
             model=self.model,
             train_dataset=self.dummy_dataset,
             eval_dataset=self.dummy_dataset,
         )
-        tune_metric = metrics.Metric(
-            name="eval_loss", greater_is_better=False, is_relative=False, criterion=0.5
+        jit_model = self.trainer.export_to_jit()
+        self.trainer.export_to_onnx('fp32-model.onnx')
+        self.assertTrue(check_onnx('fp32-model.onnx', self.trainer.get_eval_dataloader()))
+
+        self.trainer.benchmark(num_of_instance=1)
+
+        quantization_config = PostTrainingQuantConfig(
+            approach="static",
         )
-        quantization_config = QuantizationConfig(
-            approach="PostTrainingStatic",
-            metrics=[tune_metric],
-            objectives=[objectives.performance],
-            recipes={"smooth_quant": True,
-                     "smooth_quant_args": {"alpha": 0.6},
-                     }
+        quantized_model = self.trainer.quantize(quant_config=quantization_config, provider="inc")
+        self.trainer.benchmark(self.trainer.args.output_dir, num_of_instance=1)
+        # By default, model will be saved into tmp_trainer dir.
+        self.trainer.save_model('./quantized_model')
+        # test different configure to improve UT coverage
+        self.trainer.export_to_onnx(
+            save_path=None,
+            quant_format='Qlinear',
+            dtype='S8S8',
+            opset_version=13,
         )
-        recipes = quantization_config.recipes
-        self.assertTrue(recipes["smooth_quant"])
-        quantized_model = trainer.quantize(quant_config=quantization_config)
-        self.assertTrue("quantize" in str(type(quantized_model.classifier.module)))
-        quantization_config = QuantizationConfig(
-            approach="PostTrainingStatic",
-            metrics=[tune_metric],
-            objectives=[objectives.performance],
-            recipes={}
+        self.assertTrue(check_onnx('./tmp_trainer/int8-model.onnx', self.trainer.get_eval_dataloader()))
+        # Check quantized model
+        output_1 = self.trainer.predict(self.dummy_dataset).predictions
+        loaded_model = OptimizedModel.from_pretrained(
+            './quantized_model',
         )
-        quantized_model = trainer.quantize(quant_config=quantization_config,
-                                           train_func=train_func,
-                                           eval_func=eval_func)
-        self.assertTrue("quantize" in str(type(quantized_model.classifier.module)))
-
-        with self.assertRaises(ValueError):
-            quantization_config = QuantizationConfig(
-                approach="PostTrainingStatic",
-                metrics=[tune_metric],
-                objectives=[objectives.performance],
-                recipes=[]
-            )
+        self.trainer.model = loaded_model
+        output_2 = self.trainer.predict(self.dummy_dataset).predictions
+        self.assertTrue((fp32_output != output_1).any())
+
+        # check loaded model
+        self.assertTrue((output_1 == output_2).all())
 
     def test_functional_quant(self):
         def eval_func(model):
@@ -226,9 +157,8 @@ def train_func(model):
             return model
 
         self.trainer = NLPTrainer(self.model, train_dataset=self.dummy_dataset)
-        quantization_config = QuantizationConfig(
-            approach='PostTrainingStatic',
-            objectives=[objectives.performance]
+        quantization_config = PostTrainingQuantConfig(
+            approach='static',
         )
         self.trainer.quantize(quant_config=quantization_config,
                               provider="inc",
@@ -242,13 +172,8 @@ def eval_func(model):
         def train_func(model):
             return model
 
-        tune_metric = metrics.Metric(
-            name="eval_loss", greater_is_better=False, is_relative=False, criterion=0.5
-        )
-        quantization_config = QuantizationConfig(
-            approach='PostTrainingStatic',
-            metrics=[tune_metric],
-            objectives=[objectives.performance]
+        quantization_config = PostTrainingQuantConfig(
+            approach='static',
         )
         self.optimizer.eval_func = eval_func
         self.optimizer.train_func = train_func
diff --git a/workflows/dlsa/run_dlsa.py b/workflows/dlsa/run_dlsa.py
index 583e37847ea..92d6827998d 100644
--- a/workflows/dlsa/run_dlsa.py
+++ b/workflows/dlsa/run_dlsa.py
@@ -39,10 +39,10 @@
 )
 from intel_extension_for_transformers.transformers import (
     OptimizedModel,
-    QuantizationConfig,
     metrics,
     objectives,
 )
+from neural_compressor.config import PostTrainingQuantConfig, TuningCriterion
 from intel_extension_for_transformers.transformers.trainer import NLPTrainer
 
 hf_logging.set_verbosity_info()
@@ -288,12 +288,12 @@ def preprocess(examples):
     if args.do_quantize:
         with track("Quantize"):
             metric = metrics.Metric(name="eval_acc", is_relative=True, criterion=0.01)
-            q_config = QuantizationConfig(
-                framework="pytorch_ipex",
-                approach="PostTrainingStatic",
-                max_trials=200,  # set the Max tune times
-                metrics=[metric],
-                objectives=[objectives.performance],
+            trainer.metrics = metric
+            tuning_criterion = TuningCriterion(max_trials=600)
+            q_config = PostTrainingQuantConfig(
+                backend="ipex",
+                approach="static",
+                tuning_criterion=tuning_criterion
             )
 
             def eval_func(model):
diff --git a/workflows/hf_finetuning_and_inference_nlp/src/infer_itrex.py b/workflows/hf_finetuning_and_inference_nlp/src/infer_itrex.py
index b666c6f8bbc..3b6c743c485 100644
--- a/workflows/hf_finetuning_and_inference_nlp/src/infer_itrex.py
+++ b/workflows/hf_finetuning_and_inference_nlp/src/infer_itrex.py
@@ -17,7 +17,6 @@
 import torch
 from os import path
 from intel_extension_for_transformers.transformers import (
-    QuantizationConfig,
     metrics,
     objectives,
 )
@@ -29,7 +28,7 @@
     DataCollatorWithPadding,
     Trainer,
 )
-
+from neural_compressor.config import PostTrainingQuantConfig, TuningCriterion
 from infer import DlsaInference
 from utils import PredsLabels, compute_metrics, save_performance_metrics
 
@@ -76,12 +75,11 @@ def _load_model(self):
             )
 
         metric = metrics.Metric(name="eval_acc", is_relative=True, criterion=0.03)
-        q_config = QuantizationConfig(
-            framework="pytorch",
-            approach="PostTrainingStatic",
-            max_trials=200,  # set the Max tune times
-            metrics=[metric],
-            objectives=[objectives.performance],
+        self.trainer.metrics = metric
+        tuning_criterion = TuningCriterion(max_trials=200)
+        q_config = PostTrainingQuantConfig(
+            approach="static",
+            tuning_criterion=tuning_criterion,
         )
         eval_dataloader = self.trainer.get_eval_dataloader()
         self.model = self.trainer.quantize(
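The two workflow files above also show the IPEX flavor of the migration: the old framework="pytorch_ipex" argument becomes backend="ipex" on PostTrainingQuantConfig. As a standalone sketch (the trial count is illustrative, and `trainer`/`eval_func` are assumed from the DLSA context rather than prescribed):

    from neural_compressor.config import PostTrainingQuantConfig, TuningCriterion

    tuning_criterion = TuningCriterion(max_trials=200)
    q_config = PostTrainingQuantConfig(
        backend="ipex",     # run calibration and inference through Intel Extension for PyTorch
        approach="static",
        tuning_criterion=tuning_criterion,
    )
    model = trainer.quantize(quant_config=q_config, eval_func=eval_func)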