diff --git a/docs/tutorials/pytorch/question-answering/orchestrate_optimizations.ipynb b/docs/tutorials/pytorch/question-answering/orchestrate_optimizations.ipynb
index accba939b30..a70dd5a1ab5 100644
--- a/docs/tutorials/pytorch/question-answering/orchestrate_optimizations.ipynb
+++ b/docs/tutorials/pytorch/question-answering/orchestrate_optimizations.ipynb
@@ -74,13 +74,15 @@
     "import transformers\n",
     "from intel_extension_for_transformers.transformers import (\n",
     "    metrics,\n",
-    "    PrunerConfig,\n",
-    "    PruningConfig,\n",
-    "    DistillationConfig,\n",
-    "    QuantizationConfig,\n",
     "    OptimizedModel,\n",
     "    objectives\n",
     ")\n",
+    "from neural_compressor.config import (\n",
+    "    WeightPruningConfig,\n",
+    "    DistillationConfig,\n",
+    "    KnowledgeDistillationLossConfig,\n",
+    "    QuantizationAwareTrainingConfig,\n",
+    ")\n",
     "from torch.utils.data import DataLoader\n",
     "from tqdm import tqdm\n",
     "from trainer_qa import QuestionAnsweringTrainer\n",
@@ -214,7 +216,7 @@
     "        metadata={\"help\": \"Whether or not to apply prune.\"},\n",
     "    )\n",
     "    pruning_approach: Optional[str] = field(\n",
-    "        default=\"BasicMagnitude\",\n",
+    "        default=\"magnitude\",\n",
     "        metadata={\"help\": \"Pruning approach. Supported approach is basic_magnite.\"},\n",
     "    )\n",
     "    target_sparsity_ratio: Optional[float] = field(\n",
@@ -234,9 +236,9 @@
     "        metadata={\"help\": \"Whether or not to apply quantization.\"},\n",
     "    )\n",
     "    quantization_approach: Optional[str] = field(\n",
-    "        default=\"PostTrainingStatic\",\n",
-    "        metadata={\"help\": \"Quantization approach. Supported approach are PostTrainingStatic, \"\n",
-    "                          \"PostTrainingDynamic and QuantizationAwareTraining.\"},\n",
+    "        default=\"static\",\n",
+    "        metadata={\"help\": \"Quantization approach. Supported approaches are static, \"\n",
+    "                          \"dynamic and qat.\"},\n",
     "    )\n",
     "    metric_name: Optional[str] = field(\n",
     "        default=None,\n",
@@ -300,7 +302,7 @@
     ")\n",
     "optim_args = OptimizationArguments(\n",
     "    tune=True,\n",
-    "    quantization_approach=\"PostTrainingStatic\"\n",
+    "    quantization_approach=\"static\"\n",
     ")\n",
     "log_level = training_args.get_process_log_level()"
    ]
   },
@@ -730,9 +732,7 @@
     "logger.info(\"***** Number of student model parameters: {:.2f}M *****\".format(\\\n",
     "    para_counter(model)/10**6))\n",
     "\n",
-    "# Trace model\n",
-    "from neural_compressor.adaptor.torch_utils.symbolic_trace import symbolic_trace\n",
-    "model = symbolic_trace(model, optim_args.quantization_approach==\"QuantizationAwareTraining\")"
+    "# Trace model\n"
    ]
   },
   {
@@ -779,21 +779,18 @@
     "    tune_metric = metrics.Metric(\n",
     "        name=metric_name, is_relative=optim_args.is_relative, criterion=optim_args.perf_tol\n",
     "    )\n",
-    "    prune_type = 'PatternLock' \\\n",
+    "    prune_type = 'pattern_lock' \\\n",
     "        if optim_args.pruning_approach else optim_args.pruning_approach\n",
     "    target_sparsity_ratio = optim_args.target_sparsity_ratio \\\n",
     "        if optim_args.target_sparsity_ratio else None\n",
-    "    pruner_config = PrunerConfig(prune_type=prune_type, target_sparsity_ratio=target_sparsity_ratio)\n",
-    "    pruning_conf = PruningConfig(framework=\"pytorch_fx\",pruner_config=[pruner_config], metrics=tune_metric)\n",
-    "    distillation_conf = DistillationConfig(framework=\"pytorch_fx\", metrics=tune_metric)\n",
-    "\n",
-    "    objective = objectives.performance\n",
-    "    quantization_conf = QuantizationConfig(\n",
-    "        approach=optim_args.quantization_approach,\n",
-    "        max_trials=600,\n",
-    "        metrics=[tune_metric],\n",
-    "        objectives=[objective]\n",
-    "    )\n",
+    "    trainer.metrics = tune_metric\n",
+    "    pruning_conf = WeightPruningConfig([{\"start_step\": 0, \"end_step\": 2}],\n",
+    "                                       target_sparsity=target_sparsity_ratio,\n",
+    "                                       pruning_scope=\"local\",\n",
+    "                                       pruning_type=prune_type)\n",
+    "    distillation_criterion = KnowledgeDistillationLossConfig(loss_types=[\"CE\", \"KL\"])\n",
+    "    distillation_conf = DistillationConfig(teacher_model=teacher_model, criterion=distillation_criterion)\n",
+    "    quantization_conf = QuantizationAwareTrainingConfig()\n",
     "    conf_list = [pruning_conf, distillation_conf, quantization_conf]\n",
     "    model = trainer.orchestrate_optimizations(config_list=conf_list, teacher_model=teacher_model)"
    ]
   }
diff --git a/docs/tutorials/pytorch/question-answering/orchestrate_optimizations_bert_mini.ipynb b/docs/tutorials/pytorch/question-answering/orchestrate_optimizations_bert_mini.ipynb
index b3a983f8d35..78b1258d580 100644
--- a/docs/tutorials/pytorch/question-answering/orchestrate_optimizations_bert_mini.ipynb
+++ b/docs/tutorials/pytorch/question-answering/orchestrate_optimizations_bert_mini.ipynb
@@ -78,6 +78,12 @@
     "    DataCollatorWithPadding,\n",
     "    EvalPrediction,\n",
     ")\n",
+    "from neural_compressor.config import (\n",
+    "    WeightPruningConfig,\n",
+    "    DistillationConfig,\n",
+    "    KnowledgeDistillationLossConfig,\n",
+    "    QuantizationAwareTrainingConfig,\n",
+    ")\n",
     "from transformers.utils import check_min_version\n",
     "from transformers.utils.versions import require_version\n",
     "from typing import Optional\n",
@@ -430,18 +436,14 @@
     "    name=metric_name, is_relative=True, criterion=0.01\n",
     ")\n",
     "\n",
-    "target_sparsity_ratio = None\n",
-    "pruner_config = PrunerConfig(prune_type='PatternLock', target_sparsity_ratio=None)\n",
-    "pruning_conf = PruningConfig(framework=\"pytorch_fx\",pruner_config=[pruner_config], metrics=tune_metric)\n",
-    "distillation_conf = DistillationConfig(framework=\"pytorch_fx\", metrics=tune_metric)\n",
-    "\n",
-    "objective = objectives.performance\n",
-    "quantization_conf = QuantizationConfig(\n",
-    "    approach=\"QuantizationAwareTraining\",\n",
-    "    max_trials=600,\n",
-    "    metrics=[tune_metric],\n",
-    "    objectives=[objective]\n",
-    ")\n",
+    "trainer.metrics = tune_metric\n",
+    "pruning_conf = WeightPruningConfig([{\"start_step\": 0, \"end_step\": 2}],\n",
+    "                                   target_sparsity=0.64,\n",
+    "                                   pruning_scope=\"local\",\n",
+    "                                   pruning_type=\"pattern_lock\")\n",
+    "distillation_criterion = KnowledgeDistillationLossConfig(loss_types=[\"CE\", \"KL\"])\n",
+    "distillation_conf = DistillationConfig(teacher_model=teacher_model, criterion=distillation_criterion)\n",
+    "quantization_conf = QuantizationAwareTrainingConfig()\n",
     "conf_list = [pruning_conf, distillation_conf, quantization_conf]\n",
     "model = trainer.orchestrate_optimizations(config_list=conf_list, teacher_model=teacher_model)"
    ]
   }
diff --git a/docs/tutorials/pytorch/text-classification/orchestrate_optimizations_bert_mini.ipynb b/docs/tutorials/pytorch/text-classification/orchestrate_optimizations_bert_mini.ipynb
index e533ab555f9..ffbd067af30 100644
--- a/docs/tutorials/pytorch/text-classification/orchestrate_optimizations_bert_mini.ipynb
+++ b/docs/tutorials/pytorch/text-classification/orchestrate_optimizations_bert_mini.ipynb
@@ -70,12 +70,14 @@
     "from datasets import load_dataset, load_metric\n",
     "from intel_extension_for_transformers.transformers import (\n",
     "    metrics,\n",
-    "    PrunerConfig,\n",
-    "    PruningConfig,\n",
-    "    DistillationConfig,\n",
-    "    QuantizationConfig,\n",
     "    objectives\n",
     ")\n",
+    "from neural_compressor.config import (\n",
+    "    WeightPruningConfig,\n",
+    "    DistillationConfig,\n",
+    "    KnowledgeDistillationLossConfig,\n",
+    "    QuantizationAwareTrainingConfig,\n",
+    ")\n",
     "from intel_extension_for_transformers.transformers.trainer import NLPTrainer\n",
     "from transformers import (\n",
     "    AutoConfig,\n",
@@ -343,18 +345,14 @@
     "    name=metric_name, is_relative=True, criterion=0.01\n",
     ")\n",
     "\n",
-    "target_sparsity_ratio = None\n",
-    "pruner_config = PrunerConfig(prune_type='PatternLock', target_sparsity_ratio=None)\n",
-    "pruning_conf = PruningConfig(framework=\"pytorch_fx\",pruner_config=[pruner_config], metrics=tune_metric)\n",
-    "distillation_conf = DistillationConfig(framework=\"pytorch_fx\", metrics=tune_metric)\n",
-    "\n",
-    "objective = objectives.performance\n",
-    "quantization_conf = QuantizationConfig(\n",
-    "    approach=\"QuantizationAwareTraining\",\n",
-    "    max_trials=600,\n",
-    "    metrics=[tune_metric],\n",
-    "    objectives=[objective]\n",
-    ")\n",
+    "trainer.metrics = tune_metric\n",
+    "pruning_conf = WeightPruningConfig([{\"start_step\": 0, \"end_step\": 2}],\n",
+    "                                   target_sparsity=0.64,\n",
+    "                                   pruning_scope=\"local\",\n",
+    "                                   pruning_type=\"pattern_lock\")\n",
+    "distillation_criterion = KnowledgeDistillationLossConfig(loss_types=[\"CE\", \"KL\"])\n",
+    "distillation_conf = DistillationConfig(teacher_model=teacher_model, criterion=distillation_criterion)\n",
+    "quantization_conf = QuantizationAwareTrainingConfig()\n",
     "conf_list = [pruning_conf, distillation_conf, quantization_conf]\n",
     "model = trainer.orchestrate_optimizations(config_list=conf_list, teacher_model=teacher_model)"
    ]
   }
diff --git a/examples/huggingface/pytorch/question-answering/orchestrate_optimizations/run_qa.py b/examples/huggingface/pytorch/question-answering/orchestrate_optimizations/run_qa.py
index c07e38affc1..b9e416d41b9 100644
--- a/examples/huggingface/pytorch/question-answering/orchestrate_optimizations/run_qa.py
+++ b/examples/huggingface/pytorch/question-answering/orchestrate_optimizations/run_qa.py
@@ -33,13 +33,15 @@
 import transformers
 from intel_extension_for_transformers.transformers import (
     metrics,
-    PrunerConfig,
-    PruningConfig,
-    DistillationConfig,
-    QuantizationConfig,
     OptimizedModel,
     objectives
 )
+from neural_compressor.config import (
+    WeightPruningConfig,
+    DistillationConfig,
+    KnowledgeDistillationLossConfig,
+    QuantizationAwareTrainingConfig,
+)
 from torch.utils.data import DataLoader
 from tqdm import tqdm
 from trainer_qa import QuestionAnsweringTrainer
@@ -225,7 +227,7 @@ class OptimizationArguments:
         metadata={"help": "Whether or not to apply prune."},
     )
     pruning_approach: Optional[str] = field(
-        default="BasicMagnitude",
+        default="magnitude",
         metadata={"help": "Pruning approach. Supported approach is basic_magnite."},
     )
     target_sparsity_ratio: Optional[float] = field(
@@ -245,9 +247,9 @@ class OptimizationArguments:
         metadata={"help": "Whether or not to apply quantization."},
     )
     quantization_approach: Optional[str] = field(
-        default="QuantizationAwareTraining",
-        metadata={"help": "Quantization approach. Supported approach are PostTrainingStatic, "
-                          "PostTrainingDynamic and QuantizationAwareTraining."},
+        default="qat",
+        metadata={"help": "Quantization approach. Supported approaches are static, "
+                          "dynamic and qat."},
     )
     metric_name: Optional[str] = field(
         default="eval_f1",
@@ -789,7 +791,7 @@ def get_logits(teacher_model, train_dataset, teacher_train_dataset):
 
     # Trace model
     from neural_compressor.adaptor.torch_utils.symbolic_trace import symbolic_trace
-    model = symbolic_trace(model, optim_args.quantization_approach=="QuantizationAwareTraining")
+    model = symbolic_trace(model, optim_args.quantization_approach=="qat")
 
     # Initialize our Trainer
     trainer = QuestionAnsweringTrainer(
@@ -814,23 +816,20 @@ def get_logits(teacher_model, train_dataset, teacher_train_dataset):
         tune_metric = metrics.Metric(
             name=metric_name, is_relative=optim_args.is_relative, criterion=optim_args.perf_tol
         )
-        prune_type = 'PatternLock' \
+        prune_type = 'pattern_lock' \
             if optim_args.pruning_approach else optim_args.pruning_approach
         target_sparsity_ratio = optim_args.target_sparsity_ratio \
             if optim_args.target_sparsity_ratio else None
-        pruner_config = PrunerConfig(prune_type=prune_type, target_sparsity_ratio=target_sparsity_ratio)
-        pruning_conf = PruningConfig(framework="pytorch_fx",pruner_config=[pruner_config], metrics=tune_metric)
-        distillation_conf = DistillationConfig(framework="pytorch_fx", metrics=tune_metric)
-
-        objective = objectives.performance
-        quantization_conf = QuantizationConfig(
-            approach=optim_args.quantization_approach,
-            max_trials=600,
-            metrics=[tune_metric],
-            objectives=[objective]
-        )
+        trainer.metrics = tune_metric
+        pruning_conf = WeightPruningConfig([{"start_step": 0, "end_step": 2}],
+                                           target_sparsity=target_sparsity_ratio,
+                                           pruning_scope="local",
+                                           pruning_type=prune_type)
+        distillation_criterion = KnowledgeDistillationLossConfig(loss_types=["CE", "KL"])
+        distillation_conf = DistillationConfig(teacher_model=teacher_model, criterion=distillation_criterion)
+        quantization_conf = QuantizationAwareTrainingConfig()
         conf_list = [pruning_conf, distillation_conf, quantization_conf]
-        model = trainer.orchestrate_optimizations(config_list=conf_list, teacher_model=teacher_model)
+        model = trainer.orchestrate_optimizations(config_list=conf_list)
 
     if optim_args.benchmark or optim_args.accuracy_only:
         start_time = timeit.default_timer()
@@ -839,7 +838,7 @@ def get_logits(teacher_model, train_dataset, teacher_train_dataset):
         max_eval_samples = data_args.max_eval_samples \
             if data_args.max_eval_samples is not None else len(eval_dataset)
         eval_samples = min(max_eval_samples, len(eval_dataset))
-        samples = eval_samples - (eval_samples % batch_size) \
+        samples = eval_samples - (eval_samples % optim_args.batch_size) \
             if training_args.dataloader_drop_last else eval_samples
         logger.info("metrics keys: {}".format(results.keys()))
         bert_task_acc_keys = ['eval_f1', 'eval_accuracy', 'eval_matthews_correlation',
diff --git a/examples/huggingface/pytorch/text-classification/quantization/run_glue.py b/examples/huggingface/pytorch/text-classification/quantization/run_glue.py
index a6c1a96e04d..7a915a56a8a 100644
--- a/examples/huggingface/pytorch/text-classification/quantization/run_glue.py
+++ b/examples/huggingface/pytorch/text-classification/quantization/run_glue.py
@@ -550,25 +550,19 @@ def compute_metrics(p: EvalPrediction):
         )
         trainer.metrics = tune_metric
         objective = objectives.performance
+        tuning_criterion = TuningCriterion(max_trials=600, objective=[objective.name])
+        accuracy_criterion = AccuracyCriterion(
+            higher_is_better=True,  # optional.
+            criterion="relative" if optim_args.is_relative else "absolute",  # optional. Available values are "relative" and "absolute".
+            tolerable_loss=optim_args.perf_tol,  # optional.
+        )
         if optim_args.quantization_approach != "qat":
-            tuning_criterion = TuningCriterion(max_trials=600, objective=[objective.name])
-            accuracy_criterion = AccuracyCriterion(
-                higher_is_better=True,  # optional.
-                criterion="relative" if optim_args.is_relative else "absolute",  # optional. Available values are "relative" and "absolute".
-                tolerable_loss=optim_args.perf_tol,  # optional.
-            )
             quantization_config = PostTrainingQuantConfig(
                 approach=optim_args.quantization_approach,
                 tuning_criterion=tuning_criterion,
                 accuracy_criterion=accuracy_criterion
             )
         else:
-            tuning_criterion = TuningCriterion(max_trials=600, objective=["performance"])
-            accuracy_criterion = AccuracyCriterion(
-                higher_is_better=True,  # optional.
-                criterion="relative" if optim_args.is_relative else "absolute",  # optional. Available values are "relative" and "absolute".
-                tolerable_loss=optim_args.perf_tol,  # optional.
-            )
             quantization_config = QuantizationAwareTrainingConfig(
                 tuning_criterion=tuning_criterion,
                 accuracy_criterion=accuracy_criterion
diff --git a/examples/huggingface/pytorch/textual-inversion/distillation_for_quantization/textual_inversion.py b/examples/huggingface/pytorch/textual-inversion/distillation_for_quantization/textual_inversion.py
index 128c9248341..832da5ceb52 100644
--- a/examples/huggingface/pytorch/textual-inversion/distillation_for_quantization/textual_inversion.py
+++ b/examples/huggingface/pytorch/textual-inversion/distillation_for_quantization/textual_inversion.py
@@ -19,9 +19,10 @@
 from diffusers.optimization import get_scheduler
 from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
 from huggingface_hub import HfFolder, Repository, whoami
-from intel_extension_for_transformers.transformers.config import (
+from neural_compressor.config import (
     DistillationConfig,
-    QuantizationConfig,
+    IntermediateLayersKnowledgeDistillationLossConfig,
+    QuantizationAwareTrainingConfig,
 )
 from intel_extension_for_transformers.transformers.utils import metrics, objectives
 from intel_extension_for_transformers.transformers.trainer import NLPTrainer
@@ -769,12 +770,7 @@ def train_func(model):
     tune_metric = metrics.Metric(name="")
     if args.do_quantization:
         objective = objectives.performance
-        quantization_conf = QuantizationConfig(
-            approach="QuantizationAwareTraining",
-            max_trials=600,
-            metrics=[tune_metric],
-            objectives=[objective]
-        )
+        quantization_conf = QuantizationAwareTrainingConfig()
         conf_list.append(quantization_conf)
 
     if args.do_distillation:
@@ -828,17 +824,13 @@ def train_func(model):
            [['mid_block.resnets.1', ]],
            [['conv_out', ]],
        ]
-
-        distillation_conf = DistillationConfig(
-            framework="pytorch_fx", metrics=tune_metric,
-            criterion=Criterion(
-                name="IntermediateLayersLoss",
-                layer_mappings=layer_mappings,
-                loss_types=["MSE"] * len(layer_mappings),
-                loss_weight_ratio=[1.0 / len(layer_mappings)] * len(layer_mappings),
-                add_origin_loss=True
-            )
+        criterion_conf = IntermediateLayersKnowledgeDistillationLossConfig(
+            layer_mappings=layer_mappings,
+            loss_types=["MSE"] * len(layer_mappings),
+            loss_weight_ratio=[1.0 / len(layer_mappings)] * len(layer_mappings),
+            add_origin_loss=True
         )
+        distillation_conf = DistillationConfig(teacher_model=teacher_model, criterion=criterion_conf)
         conf_list.append(distillation_conf)
 
     # Initialize our Trainer
@@ -846,10 +838,10 @@ def train_func(model):
         model=model,
         args=TrainingArguments(output_dir=args.output_dir),
     )
+    trainer.metrics = tune_metric
     model = trainer.orchestrate_optimizations(
         config_list=conf_list,
-        teacher_model=teacher_model,
         eval_func=lambda model:1,
         train_func=train_func)
diff --git a/examples/huggingface/tensorflow/language-modeling/quantization/README.md b/examples/huggingface/tensorflow/language-modeling/quantization/README.md
deleted file mode 100644
index 883689068bc..00000000000
--- a/examples/huggingface/tensorflow/language-modeling/quantization/README.md
+++ /dev/null
@@ -1,63 +0,0 @@
-Step-by-Step
-=========
-
-This document describes the step-by-step instructions for reproducing the quantization on models for the Language Modeling tasks.
-
-There are mainly two kinds of language modeling tasks: Causal Language Modeling (CLM) and Masked Language Modeling (MLM). Two scripts `run_clm.py` and `run_mlm.py` provide quantization examples on the above two kinds of models based on [Intel® Neural Compressor](https://github.com/intel/neural-compressor). Users can easily run the quantization with `run_tuning.sh` and the benchmarking with `run_benchmark.sh`.
-
-Please note that language modeling tasks use `loss` as the evaluation metric so the loss will appear where the accuracy should be in the final tune result statistics, and the `greater_is_better=False` should be set in the Python scripts.
-
-Users can also change the `--max_training_samples`, `--max_eval_samples`, and `--max_seq_length` in the scripts for quicker debugging and to avoid potential lack of memory.
-
-# Prerequisite
-## 1. Installation
-
-Make sure you have installed Intel® Extension for Transformers and all the dependencies in the current example:
-
-```shell
-pip install intel-extension-for-transformers
-cd ptq
-pip install -r requirements.txt
-```
-
-# Run
-
-## 1. Run Command for the CLM task (Shell)
-
-- Topology:
-   - distilgpt2_clm
-
-* To get the int8 model
-
-```
-cd ptq
-bash run_tuning.sh --topology=[topology]
-```
-
-* To benchmark the int8 model
-
-```
-cd ptq
-bash run_benchmark.sh --topology=[topology] --mode=benchmark --int8=true
-```
-
-## 2. 
Run Command for the MLM task (Shell) - -- Topology: - - distilbert_mlm - - distilroberta_mlm - -* To get the int8 model - -``` -cd ptq -bash run_tuning.sh --topology=[topology] -``` - -* To benchmark the int8 model - -``` -cd ptq -bash run_benchmark.sh --topology=[topology] --mode=benchmark --int8=true -``` \ No newline at end of file diff --git a/examples/huggingface/tensorflow/language-modeling/quantization/ptq/requirements.txt b/examples/huggingface/tensorflow/language-modeling/quantization/ptq/requirements.txt deleted file mode 100644 index 62aa53701f4..00000000000 --- a/examples/huggingface/tensorflow/language-modeling/quantization/ptq/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -datasets >= 1.17 -sentencepiece != 0.1.92 -protobuf -intel-tensorflow -transformers -scikit-learn -accelerate \ No newline at end of file diff --git a/examples/huggingface/tensorflow/language-modeling/quantization/ptq/run_benchmark.sh b/examples/huggingface/tensorflow/language-modeling/quantization/ptq/run_benchmark.sh deleted file mode 100644 index e3cb8c3c55a..00000000000 --- a/examples/huggingface/tensorflow/language-modeling/quantization/ptq/run_benchmark.sh +++ /dev/null @@ -1,119 +0,0 @@ -#!/bin/bash -set -x - -function main { - - init_params "$@" - run_benchmark - -} - -# init params -function init_params { - topology="distilgpt2_clm" - iters=100 - batch_size=16 - tuned_checkpoint=saved_results - cache_dir="cache" - for var in "$@" - do - case $var in - --topology=*) - topology=$(echo $var |cut -f2 -d=) - ;; - --dataset_location=*) - dataset_location=$(echo $var |cut -f2 -d=) - ;; - --input_model=*) - input_model=$(echo $var |cut -f2 -d=) - ;; - --mode=*) - mode=$(echo $var |cut -f2 -d=) - ;; - --batch_size=*) - batch_size=$(echo $var |cut -f2 -d=) - ;; - --iters=*) - iters=$(echo ${var} |cut -f2 -d=) - ;; - --int8=*) - int8=$(echo ${var} |cut -f2 -d=) - ;; - --config=*) - tuned_checkpoint=$(echo $var |cut -f2 -d=) - ;; - --worker=*) - worker=$(echo $var |cut -f2 -d=) - ;; - --task_index=*) - task_index=$(echo $var |cut -f2 -d=) - ;; - --cache_dir=*) - cache_dir=$(echo $var |cut -f2 -d=) - ;; - *) - echo "Error: No such parameter: ${var}" - exit 1 - ;; - esac - done - -} - - -# run_benchmark -function run_benchmark { - extra_cmd='' - MAX_SEQ_LENGTH=128 - - if [[ ${mode} == "accuracy" ]]; then - mode_cmd=" --accuracy_only" - elif [[ ${mode} == "benchmark" ]]; then - mode_cmd=" --benchmark " - else - echo "Error: No such mode: ${mode}" - exit 1 - fi - - if [ "${topology}" = "distilgpt2_clm" ]; then - script="run_clm.py" - dataset_name="wikitext" - model_name_or_path="distilgpt2" - dataset_config_name="wikitext-2-raw-v1" - # remove following two parameters if you have enough memory - extra_cmd=$extra_cmd" --max_eval_samples 196 --block_size 128" - elif [ "${topology}" = "distilbert_mlm" ]; then - script="run_mlm.py" - dataset_name="wikitext" - model_name_or_path="distilbert-base-cased" - dataset_config_name="wikitext-2-raw-v1" - # remove following two parameters if you have enough memory - extra_cmd=$extra_cmd" --max_eval_samples 196 --max_seq_length 128" - elif [ "${topology}" = "distilroberta_mlm" ]; then - script="run_mlm.py" - dataset_name="wikitext" - model_name_or_path="Rocketknight1/distilroberta-base-finetuned-wikitext2" - dataset_config_name="wikitext-2-raw-v1" - # remove following two parameters if you have enough memory - extra_cmd=$extra_cmd" --max_eval_samples 196 --max_seq_length 128" - fi - - if [[ ${int8} == "true" ]]; then - extra_cmd=$extra_cmd" --int8" - fi - echo $extra_cmd - - 
python -u ../${script} \ - --model_name_or_path ${model_name_or_path} \ - --dataset_name ${dataset_name} \ - --dataset_config_name ${dataset_config_name} \ - --do_eval \ - --per_device_eval_batch_size ${batch_size} \ - --output_dir ${tuned_checkpoint} \ - --overwrite_output_dir \ - --cache_dir ${cache_dir} \ - ${mode_cmd} \ - ${extra_cmd} -} - -main "$@" diff --git a/examples/huggingface/tensorflow/language-modeling/quantization/ptq/run_tuning.sh b/examples/huggingface/tensorflow/language-modeling/quantization/ptq/run_tuning.sh deleted file mode 100644 index 6ffa270911f..00000000000 --- a/examples/huggingface/tensorflow/language-modeling/quantization/ptq/run_tuning.sh +++ /dev/null @@ -1,115 +0,0 @@ -#!/bin/bash -set -x - -function main { - - init_params "$@" - run_tuning - -} - -# init params -function init_params { - topology="distilgpt2_clm" - tuned_checkpoint="saved_results" - extra_cmd="" - batch_size=8 - MAX_SEQ_LENGTH=128 - model_type="bert" - approach="PostTrainingStatic" - cache_dir="cache" - for var in "$@" - do - case $var in - --topology=*) - topology=$(echo $var |cut -f2 -d=) - ;; - --dataset_location=*) - dataset_location=$(echo $var |cut -f2 -d=) - ;; - --input_model=*) - input_model=$(echo $var |cut -f2 -d=) - ;; - --output_model=*) - tuned_checkpoint=$(echo $var |cut -f2 -d=) - ;; - --worker=*) - worker=$(echo $var |cut -f2 -d=) - ;; - --task_index=*) - task_index=$(echo $var |cut -f2 -d=) - ;; - --cache_dir=*) - cache_dir=$(echo $var |cut -f2 -d=) - ;; - *) - echo "Error: No such parameter: ${var}" - exit 1 - ;; - esac - done - -} - -# run_tuning -function run_tuning { - if [ "${topology}" = "distilgpt2_clm" ]; then - script="run_clm.py" - model_name_or_path="distilgpt2" - dataset_name="wikitext" - approach="PostTrainingStatic" - dataset_config_name="wikitext-2-raw-v1" - # remove or change following two parameters if you have enough memory - extra_cmd=$extra_cmd" --max_eval_samples 96 --block_size 128 --perf_tol 0.08" - elif [ "${topology}" = "distilbert_mlm" ]; then - script="run_mlm.py" - model_name_or_path="distilbert-base-cased" - dataset_name="wikitext" - approach="PostTrainingStatic" - dataset_config_name="wikitext-2-raw-v1" - # remove or change following two parameters if you have enough memory - extra_cmd=$extra_cmd" --max_eval_samples 96 --max_seq_length 128 --perf_tol 0.08" - elif [ "${topology}" = "distilroberta_mlm" ]; then - script="run_mlm.py" - model_name_or_path="Rocketknight1/distilroberta-base-finetuned-wikitext2" - dataset_name="wikitext" - approach="PostTrainingStatic" - dataset_config_name="wikitext-2-raw-v1" - # remove or change following two parameters if you have enough memory - extra_cmd=$extra_cmd" --max_eval_samples 96 --max_seq_length 128 --perf_tol 0.08" - fi - - if [ "${worker}" = "" ] - then - python -u ../${script} \ - --model_name_or_path ${model_name_or_path} \ - --dataset_name ${dataset_name} \ - --dataset_config_name ${dataset_config_name} \ - --do_eval \ - --output_dir ${tuned_checkpoint} \ - --quantization_approach ${approach} \ - --do_train \ - --overwrite_output_dir \ - --cache_dir ${cache_dir} \ - --tune \ - ${extra_cmd} - else - python -u ../${script} \ - --model_name_or_path ${model_name_or_path} \ - --dataset_name ${dataset_name} \ - --dataset_config_name ${dataset_config_name} \ - --task_name ${TASK_NAME} \ - --do_eval \ - --output_dir ${tuned_checkpoint} \ - --quantization_approach ${approach} \ - --do_train \ - --overwrite_output_dir \ - --cache_dir ${cache_dir} \ - --tune \ - --worker "${worker}" \ - --task_index 
${task_index} \ - ${extra_cmd} - fi -} - -main "$@" diff --git a/examples/huggingface/tensorflow/language-modeling/quantization/run_clm.py b/examples/huggingface/tensorflow/language-modeling/quantization/run_clm.py deleted file mode 100644 index 1b82d1ccf0f..00000000000 --- a/examples/huggingface/tensorflow/language-modeling/quantization/run_clm.py +++ /dev/null @@ -1,814 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Fine-tuning the library models for causal language modeling (GPT-2, GPT-Neo...) -on a text file or a dataset without using HuggingFace Trainer. -Here is the full list of checkpoints on the hub that can be fine-tuned by this script: -https://huggingface.co/models?filter=text-generation -""" -# You can also adapt this script on your own clm task. Pointers for this are left as comments. - -import json - -# region Imports -import logging -import math -import os -import random -import sys -from dataclasses import dataclass, field -from itertools import chain -from pathlib import Path -from typing import Optional -import time - -import numpy as np -import datasets -import tensorflow as tf -from datasets import load_dataset, load_metric -from sklearn.model_selection import train_test_split -from transformers.trainer_utils import get_last_checkpoint, is_main_process - -import transformers -from transformers import ( - CONFIG_MAPPING, - CONFIG_NAME, - TF2_WEIGHTS_NAME, - TF_MODEL_FOR_CAUSAL_LM_MAPPING, - AutoConfig, - AutoTokenizer, - HfArgumentParser, - TFAutoModelForCausalLM, - TFTrainingArguments, - create_optimizer, - set_seed, -) -from transformers.utils.versions import require_version - -logger = logging.getLogger(__name__) -require_version("datasets>=1.8.0", "To fix: pip install -r examples/tensorflow/language-modeling/requirements.txt") -MODEL_CONFIG_CLASSES = list(TF_MODEL_FOR_CAUSAL_LM_MAPPING.keys()) -MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) -# endregion - -# region Command-line arguments -@dataclass -class ModelArguments: - """ - Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. - """ - - model_name_or_path: Optional[str] = field( - default=None, - metadata={ - "help": ( - "The model checkpoint for weights initialization.Don't set if you want to train a model from scratch." - ) - }, - ) - model_type: Optional[str] = field( - default=None, - metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)}, - ) - config_overrides: Optional[str] = field( - default=None, - metadata={ - "help": ( - "Override some existing default config settings when a model is trained from scratch. 
Example: " - "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index" - ) - }, - ) - config_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} - ) - tokenizer_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} - ) - cache_dir: Optional[str] = field( - default=None, - metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, - ) - use_fast_tokenizer: bool = field( - default=True, - metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, - ) - model_revision: str = field( - default="main", - metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, - ) - use_auth_token: bool = field( - default=False, - metadata={ - "help": ( - "Will use the token generated when running `huggingface-cli login` (necessary to use this script " - "with private models)." - ) - }, - ) - - def __post_init__(self): - if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None): - raise ValueError( - "--config_overrides can't be used in combination with --config_name or --model_name_or_path" - ) - - -@dataclass -class DataTrainingArguments: - """ - Arguments pertaining to what data we are going to input our model for training and eval. - """ - - dataset_name: Optional[str] = field( - default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} - ) - dataset_config_name: Optional[str] = field( - default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} - ) - train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) - validation_file: Optional[str] = field( - default=None, - metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, - ) - max_seq_length: int = field( - default=128, - metadata={ - "help": "The maximum total input sequence length after tokenization. Sequences longer " - "than this will be truncated, sequences shorter will be padded." - }, - ) - pad_to_max_length: bool = field( - default=False, - metadata={ - "help": "Whether to pad all samples to `max_seq_length`. " - "If False, will pad the samples dynamically when batching to the maximum length in the batch." - }, - ) - overwrite_cache: bool = field( - default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} - ) - validation_split_percentage: Optional[int] = field( - default=5, - metadata={ - "help": "The percentage of the train set used as validation set in case there's no validation split" - }, - ) - block_size: Optional[int] = field( - default=None, - metadata={ - "help": ( - "Optional input sequence length after tokenization. " - "The training dataset will be truncated in block of this size for training. " - "Default to the model max input length for single sentence inputs (take into account special tokens)." 
- ) - }, - ) - preprocessing_num_workers: Optional[int] = field( - default=None, - metadata={"help": "The number of processes to use for the preprocessing."}, - ) - line_by_line: bool = field( - default=False, - metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."}, - ) - max_train_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set." - ) - }, - ) - max_eval_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of evaluation examples to this " - "value if set." - ) - }, - ) - keep_linebreaks: bool = field( - default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."} - ) - - def __post_init__(self): - if self.dataset_name is None and self.train_file is None and self.validation_file is None: - raise ValueError("Need either a dataset name or a training/validation file.") - else: - if self.train_file is not None: - extension = self.train_file.split(".")[-1] - assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file." - if self.validation_file is not None: - extension = self.validation_file.split(".")[-1] - assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." - -@dataclass -class OptimizationArguments: - """ - Arguments pertaining to what type of optimization we are going to apply on the model. - """ - - tune: bool = field( - default=False, - metadata={"help": "Whether or not to apply quantization."}, - ) - quantization_approach: Optional[str] = field( - default="PostTrainingStatic", - metadata={"help": "Quantization approach. Supported approach are PostTrainingStatic, " - "PostTrainingDynamic and QuantizationAwareTraining."}, - ) - metric_name: Optional[str] = field( - default=None, - metadata={"help": "Metric used for the tuning strategy."}, - ) - is_relative: Optional[bool] = field( - default=True, - metadata={"help": "Metric tolerance model, expected to be relative or absolute."}, - ) - perf_tol: Optional[float] = field( - default=0.01, - metadata={"help": "Performance tolerance when optimizing the model."}, - ) - benchmark: bool = field( - default=False, - metadata={"help": "run benchmark."}) - int8: bool = field( - default=False, - metadata={"help":"Whether to use the quantized int8 model."}) - accuracy_only: bool = field( - default=False, - metadata={"help":"Whether to only test accuracy for model tuned by Neural Compressor."}) - -@dataclass -class DistributedArguments: - """ - Arguments setting the distributed multinode environment - """ - - worker: str = field( - default=None, - metadata={"help": "List of node ip addresses in a string, and there should not be space between addresses."}, - ) - task_index: int = field( - default=0, - metadata={"help": "Worker index, and 0 represents the chief worker while other workers are set as 1,2,3..."}, - ) - - -# endregion - -def main(): - # region Argument Parsing - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments, OptimizationArguments, DistributedArguments)) - if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): - # If we pass only one argument to the script and it's the path to a json file, - # let's parse it to get our arguments. 
- model_args, data_args, training_args, optim_args, distributed_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) - else: - model_args, data_args, training_args, optim_args, distributed_args = parser.parse_args_into_dataclasses() - - # region Setup logging - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - handlers=[logging.StreamHandler(sys.stdout)], - ) - logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN) - - # Set the verbosity to info of the Transformers logger (on main process only): - if is_main_process(training_args.local_rank): - transformers.utils.logging.set_verbosity_info() - transformers.utils.logging.enable_default_handler() - transformers.utils.logging.enable_explicit_format() - logger.info(f"Training/evaluation parameters {training_args}") - # endregion - - # Sanity checks - if data_args.dataset_name is None and data_args.train_file is None and data_args.validation_file is None: - raise ValueError("Need either a dataset name or a training/validation file.") - else: - if data_args.train_file is not None: - extension = data_args.train_file.split(".")[-1] - assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, json or txt file." - if data_args.validation_file is not None: - extension = data_args.validation_file.split(".")[-1] - assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, json or txt file." - - if training_args.output_dir is not None: - training_args.output_dir = Path(training_args.output_dir) - os.makedirs(training_args.output_dir, exist_ok=True) - # endregion - - # region Set the multinode environment, the strategy and paths - strategy = None - worker_list = None - if distributed_args.worker is not None: - logger.info("distributed environment initialization...") - - worker_list = distributed_args.worker.split(",") - - from intel_extension_for_transformers.transformers.utils.utility_tf import distributed_init - distributed_init(worker_list, "worker", distributed_args.task_index) - - strategy = tf.distribute.MultiWorkerMirroredStrategy() - from intel_extension_for_transformers.transformers.utils.utility_tf import get_filepath - training_args.output_dir = get_filepath(training_args.output_dir, strategy.cluster_resolver.task_type, strategy.cluster_resolver.task_id) - else: - strategy = training_args.strategy - #endregion - - # region Checkpoints - # Detecting last checkpoint. - checkpoint = None - if len(os.listdir(training_args.output_dir)) > 0 and not training_args.overwrite_output_dir and not training_args.do_eval: - config_path = training_args.output_dir / CONFIG_NAME - weights_path = training_args.output_dir / TF2_WEIGHTS_NAME - if config_path.is_file() and weights_path.is_file(): - checkpoint = training_args.output_dir - logger.info( - f"Checkpoint detected, resuming training from checkpoint in {training_args.output_dir}. To avoid this" - " behavior, change the `--output_dir` or add `--overwrite_output_dir` to train from scratch." - ) - else: - raise ValueError( - f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to continue regardless." - ) - - # endregion - - # If passed along, set the training seed now. 
- if training_args.seed is not None: - set_seed(training_args.seed) - - # region Load datasets - # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) - # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ - # (the dataset will be downloaded automatically from the datasets Hub). - # - # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called - # 'text' is found. You can easily tweak this behavior (see below). - # - # In distributed training, the load_dataset function guarantee that only one local process can concurrently - # download the dataset. - if data_args.dataset_name is not None: - # Downloading and loading a dataset from the hub. - raw_datasets = load_dataset( - data_args.dataset_name, - data_args.dataset_config_name, - cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, - ) - if "validation" not in raw_datasets.keys(): - raw_datasets["validation"] = load_dataset( - data_args.dataset_name, - data_args.dataset_config_name, - split=f"train[:{data_args.validation_split_percentage}%]", - cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, - ) - raw_datasets["train"] = load_dataset( - data_args.dataset_name, - data_args.dataset_config_name, - split=f"train[{data_args.validation_split_percentage}%:]", - cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, - ) - else: - data_files = {} - dataset_args = {} - if data_args.train_file is not None: - data_files["train"] = data_args.train_file - if data_args.validation_file is not None: - data_files["validation"] = data_args.validation_file - extension = ( - data_args.train_file.split(".")[-1] - if data_args.train_file is not None - else data_args.validation_file.split(".")[-1] - ) - if extension == "txt": - extension = "text" - dataset_args["keep_linebreaks"] = data_args.keep_linebreaks - raw_datasets = load_dataset( - extension, - data_files=data_files, - cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, - **dataset_args, - ) - # If no validation data is there, validation_split_percentage will be used to divide the dataset. - if "validation" not in raw_datasets.keys(): - raw_datasets["validation"] = load_dataset( - extension, - data_files=data_files, - split=f"train[:{data_args.validation_split_percentage}%]", - cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, - **dataset_args, - ) - raw_datasets["train"] = load_dataset( - extension, - data_files=data_files, - split=f"train[{data_args.validation_split_percentage}%:]", - cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, - **dataset_args, - ) - # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at - # https://huggingface.co/docs/datasets/loading_datasets.html. - # endregion - # region Load pretrained model and tokenizer - # - # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently - # download model & vocab. 
- config = AutoConfig.from_pretrained( - model_args.config_name if model_args.config_name else model_args.model_name_or_path, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - _commit_hash="main", - ) - tokenizer = AutoTokenizer.from_pretrained( - model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, - cache_dir=model_args.cache_dir, - use_fast=model_args.use_fast_tokenizer, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - _commit_hash="main", - ) - # endregion - - - # region Dataset preprocessing - # First we tokenize all the texts. - column_names = raw_datasets["train"].column_names - text_column_name = "text" if "text" in column_names else column_names[0] - - def tokenize_function(examples): - return tokenizer(examples[text_column_name], return_token_type_ids=True) - - tokenized_datasets = raw_datasets.map( - tokenize_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on dataset", - ) - - if data_args.block_size is None: - block_size = tokenizer.model_max_length - if block_size > 1024: - logger.warning( - f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " - "Picking 1024 instead. You can change that default value by passing --block_size xxx." - ) - block_size = 1024 - else: - if data_args.block_size > tokenizer.model_max_length: - logger.warning( - f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model" - f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}." - ) - block_size = min(data_args.block_size, tokenizer.model_max_length) - - # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. - def group_texts(examples): - # Concatenate all texts. - concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} - total_length = len(concatenated_examples[list(examples.keys())[0]]) - # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can - # customize this part to your needs. - if total_length >= block_size: - total_length = (total_length // block_size) * block_size - # Split by chunks of max_len. - result = { - k: [t[i : i + block_size] for i in range(0, total_length, block_size)] - for k, t in concatenated_examples.items() - } - result["labels"] = result["input_ids"].copy() - return result - - # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder - # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower - # to preprocess. - # - # To speed up this part, we use multiprocessing. 
See the documentation of the map method for more information: - # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map - - tokenized_datasets = tokenized_datasets.map( - group_texts, - batched=True, - num_proc=data_args.preprocessing_num_workers, - load_from_cache_file=not data_args.overwrite_cache, - desc=f"Grouping texts in chunks of {block_size}", - ) - - train_dataset = tokenized_datasets["train"] - if data_args.validation_file is not None: - eval_dataset = tokenized_datasets["validation"] - else: - logger.info( - f"Validation file not found: using {data_args.validation_split_percentage}% of the dataset as validation" - " as provided in data_args" - ) - train_indices, val_indices = train_test_split( - list(range(len(train_dataset))), test_size=data_args.validation_split_percentage / 100 - ) - - eval_dataset = train_dataset.select(val_indices) - train_dataset = train_dataset.select(train_indices) - - if data_args.max_train_samples is not None: - max_train_samples = min(len(train_dataset), data_args.max_train_samples) - train_dataset = train_dataset.select(range(max_train_samples)) - if data_args.max_eval_samples is not None: - max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) - eval_dataset = eval_dataset.select(range(max_eval_samples)) - - # Log a few random samples from the training set: - for index in random.sample(range(len(train_dataset)), min(3, len(train_dataset))): - logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") - # endregion - - with strategy.scope(): - # region Prepare model - if checkpoint is not None: - model = TFAutoModelForCausalLM.from_pretrained(checkpoint, config=config, cache_dir=model_args.cache_dir,) - elif model_args.model_name_or_path: - model = TFAutoModelForCausalLM.from_pretrained(model_args.model_name_or_path, config=config, - cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None,) - else: - logger.info("Training new model from scratch") - model = TFAutoModelForCausalLM.from_config(config) - - model.resize_token_embeddings(len(tokenizer)) - # endregion - - # region TF Dataset preparation - num_replicas = (len(worker_list) if worker_list is not None else 1) - options = tf.data.Options() - options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF - - # model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in - # training. This is the recommended way to use a Hugging Face dataset when training with Keras. 
You can also - # use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names - # yourself if you use this method, whereas they are automatically inferred from the model input names when - # using model.prepare_tf_dataset() - # For more info see the docs: - # https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset - # https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset - - if model_args.model_name_or_path == "distilgpt2": - train_dataset = train_dataset.remove_columns('token_type_ids') - eval_dataset = eval_dataset.remove_columns('token_type_ids') - - tf_train_dataset = model.prepare_tf_dataset( - train_dataset, - shuffle=True, - batch_size=num_replicas * training_args.per_device_train_batch_size, - ).with_options(options) - - tf_eval_dataset = model.prepare_tf_dataset( - eval_dataset, - shuffle=False, - batch_size=num_replicas * training_args.per_device_eval_batch_size, - drop_remainder=True, - ).with_options(options) - # endregion - - # region Optimizer and loss - num_train_steps = len(tf_train_dataset) * int(training_args.num_train_epochs) - if training_args.warmup_steps > 0: - num_warmup_steps = training_args.warmup_steps - elif training_args.warmup_ratio > 0: - num_warmup_steps = int(num_train_steps * training_args.warmup_ratio) - else: - num_warmup_steps = 0 - - # Bias and layernorm weights are automatically excluded from the decay - optimizer, lr_schedule = create_optimizer( - init_lr=training_args.learning_rate, - num_train_steps=num_train_steps, - num_warmup_steps=num_warmup_steps, - adam_beta1=training_args.adam_beta1, - adam_beta2=training_args.adam_beta2, - adam_epsilon=training_args.adam_epsilon, - weight_decay_rate=training_args.weight_decay, - adam_global_clipnorm=training_args.max_grad_norm, - ) - # no user-specified loss = will use the model internal loss - model.compile(optimizer=optimizer, jit_compile=training_args.xla) - - def compute_metrics(preds, labels): - preds = preds["logits"] - # preds have the same shape as the labels, after the argmax(-1) has been calculated - # by preprocess_logits_for_metrics but we need to shift the labels - labels = labels[:, 1:] - preds = preds[:, :-1] - return hf_compute_loss(labels, preds) - - # loss function for CLM model - def hf_compute_loss(labels, logits): - loss_fn = tf.keras.losses.SparseCategoricalCrossentropy( - from_logits=True, reduction=tf.keras.losses.Reduction.NONE - ) - - # Clip negative labels to zero here to avoid NaNs and errors - those positions will get masked later anyway - unmasked_loss = loss_fn(tf.nn.relu(labels), logits) - # make sure only labels that are not equal to -100 affect the loss - loss_mask = tf.cast(labels != -100, dtype=unmasked_loss.dtype) - masked_loss = unmasked_loss * loss_mask - reduced_masked_loss = tf.reduce_sum(masked_loss) / tf.reduce_sum(loss_mask) - return tf.reshape(reduced_masked_loss, (1,)) - - def eval_func_clm(model): - label_ids: np.ndarray = None - - num_examples = sum(1 for _ in ( - tf_eval_dataset.unbatch() if hasattr(tf_eval_dataset, "unbatch") else tf_eval_dataset)) - logger.info(f"***** Running Evaluation *****") - logger.info(f" Num examples in dataset = {num_examples}") - logger.info(f" Batch size = {training_args.per_device_eval_batch_size}") - - preds: np.ndarray = None - infer = model.signatures["serving_default"] - - for idx, (inputs, labels) in enumerate(tf_eval_dataset): - for name in inputs: - inputs[name] = 
tf.constant(inputs[name].numpy(), dtype=infer.inputs[0].dtype) - - results = infer(**inputs) - if preds is None: - preds = results["Identity"].numpy() - else: - preds = np.append(preds, results["Identity"].numpy(), axis=0) - - if label_ids is None: - label_ids = labels[0].numpy() if isinstance( - labels, list) else labels.numpy() - else: - label_ids = np.append( - label_ids, - labels[0].numpy() - if isinstance(labels, list) else labels.numpy(), - axis=0) - test_predictions = {"logits": preds} - loss = compute_metrics(test_predictions, label_ids) - - return loss.numpy()[0] - - # region tuning - if optim_args.tune: - from intel_extension_for_transformers.transformers import metrics, objectives, QuantizationConfig, TFOptimization - optimization = TFOptimization( - model=model, - args=training_args, - train_dataset=tf_train_dataset, - eval_dataset=tf_eval_dataset, - compute_metrics=compute_metrics, - task_type=strategy.cluster_resolver.task_type if isinstance(strategy, tf.distribute.MultiWorkerMirroredStrategy) else None, - task_id=strategy.cluster_resolver.task_id if isinstance(strategy, tf.distribute.MultiWorkerMirroredStrategy) else None, - ) - - # use customized eval function - optimization.eval_func = eval_func_clm - - tune_metric = metrics.Metric( - name="loss", greater_is_better=False, is_relative=True, criterion=optim_args.perf_tol, - ) - quantization_config = QuantizationConfig( - framework="tensorflow", - approach="POSTTRAININGSTATIC", - metrics=[tune_metric], - objectives=[objectives.performance] - ) - quantized_model = optimization.quantize(quant_config=quantization_config) - exit(0) - # endregion - - # region Training and validation - if training_args.do_train: - logger.info("***** Running training *****") - logger.info(f" Num examples = {len(train_dataset)}") - logger.info(f" Num Epochs = {training_args.num_train_epochs}") - logger.info(f" Instantaneous batch size per device = {training_args.per_device_train_batch_size}") - logger.info(f" Total train batch size = {training_args.per_device_train_batch_size * num_replicas}") - - # For long training runs, you may wish to use the PushToHub() callback here to save intermediate checkpoints - # to the Hugging Face Hub rather than just pushing the finished model. 
- # See https://huggingface.co/docs/transformers/main_classes/keras_callbacks#transformers.PushToHubCallback - history = model.fit( - tf_train_dataset, - validation_data=tf_eval_dataset, - epochs=int(training_args.num_train_epochs), - ) - train_loss = history.history["loss"][-1] - try: - train_perplexity = math.exp(train_loss) - except OverflowError: - train_perplexity = math.inf - logger.info(f" Final train loss: {train_loss:.3f}") - logger.info(f" Final train perplexity: {train_perplexity:.3f}") - validation_loss = history.history["val_loss"][-1] - try: - validation_perplexity = math.exp(validation_loss) - except OverflowError: - validation_perplexity = math.inf - logger.info(f" Final validation loss: {validation_loss:.3f}") - logger.info(f" Final validation perplexity: {validation_perplexity:.3f}") - - if training_args.output_dir is not None: - output_eval_file = os.path.join(training_args.output_dir, "all_results.json") - results_dict = dict() - results_dict["train_loss"] = train_loss - results_dict["train_perplexity"] = train_perplexity - results_dict["eval_loss"] = validation_loss - results_dict["eval_perplexity"] = validation_perplexity - with open(output_eval_file, "w") as writer: - writer.write(json.dumps(results_dict)) - - if training_args.output_dir is not None and not training_args.push_to_hub: - # If we're not pushing to hub, at least save a local copy when we're done - model.save_pretrained(training_args.output_dir) - # endregion - - # region Evaluation - if training_args.do_eval: - num_examples = sum(1 for _ in ( - tf_eval_dataset.unbatch() if hasattr(tf_eval_dataset, "unbatch") else tf_eval_dataset)) - - if optim_args.int8: - model = tf.saved_model.load(training_args.output_dir) - else: - from intel_extension_for_transformers.transformers.utils.utility_tf import keras2SavedModel - model = keras2SavedModel(model) - - preds: np.ndarray = None - label_ids: np.ndarray = None - infer = model.signatures["serving_default"] - - if optim_args.accuracy_only: - iterations = 1 - warmup = 0 - else: - iterations = 10 - warmup = 5 - latency_list = [] - - for idx in range(iterations): - iteration_time = 0 - for i, (inputs, labels) in enumerate(tf_eval_dataset): - for name in inputs: - inputs[name] = tf.constant(inputs[name].numpy(), dtype=infer.inputs[0].dtype) - - start = time.time() - results = infer(**inputs) - iteration_time += time.time() - start - if idx == 0: # only accumulate once all the preds and labels - if preds is None: - preds = results["Identity"].numpy() - else: - preds = np.append(preds, results["Identity"].numpy(), axis=0) - if label_ids is None: - label_ids = labels[0].numpy() if isinstance( - labels, list) else labels.numpy() - else: - label_ids = np.append( - label_ids, - labels[0].numpy() - if isinstance(labels, list) else labels.numpy(), - axis=0) - latency_list.append(iteration_time) - logger.info("Iteration {} time: {} sec".format(idx, iteration_time)) - - loss = compute_metrics({"logits": preds}, label_ids) - logger.info("\nEvaluation result: ") - logger.info("Accuracy: {}".format(loss.numpy()[0])) - - average_iteration_time = np.array(latency_list[warmup:]).mean() - logger.info( - "Throughput: {} samples/sec".format( - num_examples / average_iteration_time) - ) - #endregion - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/examples/huggingface/tensorflow/language-modeling/quantization/run_mlm.py b/examples/huggingface/tensorflow/language-modeling/quantization/run_mlm.py deleted file mode 100644 index be683113ccf..00000000000 --- 
a/examples/huggingface/tensorflow/language-modeling/quantization/run_mlm.py +++ /dev/null @@ -1,848 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Fine-tuning the library models for masked language modeling (BERT, ALBERT, RoBERTa...) -on a text file or a dataset without using HuggingFace Trainer. -Here is the full list of checkpoints on the hub that can be fine-tuned by this script: -https://huggingface.co/models?filter=fill-mask -""" -# You can also adapt this script on your own mlm task. Pointers for this are left as comments. - -import json -import logging -import math -import os -import random -import sys -from dataclasses import dataclass, field -from itertools import chain -from pathlib import Path -from typing import Optional -import time - -import datasets -import tensorflow as tf -from datasets import load_dataset -from sklearn.model_selection import train_test_split - -import numpy as np - -import transformers -from transformers import ( - CONFIG_MAPPING, - CONFIG_NAME, - TF2_WEIGHTS_NAME, - TF_MODEL_FOR_MASKED_LM_MAPPING, - AutoConfig, - AutoTokenizer, - DataCollatorForLanguageModeling, - HfArgumentParser, - PushToHubCallback, - TFAutoModelForMaskedLM, - TFTrainingArguments, - create_optimizer, - set_seed, -) - -from transformers.utils.versions import require_version -from transformers.trainer_utils import get_last_checkpoint, is_main_process - -logger = logging.getLogger(__name__) -require_version("datasets>=1.8.0", "To fix: pip install -r examples/tensorflow/language-modeling/requirements.txt") -MODEL_CONFIG_CLASSES = list(TF_MODEL_FOR_MASKED_LM_MAPPING.keys()) -MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) - -# region Command-line arguments -@dataclass -class ModelArguments: - """ - Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. - """ - - model_name_or_path: Optional[str] = field( - default=None, - metadata={ - "help": ( - "The model checkpoint for weights initialization.Don't set if you want to train a model from scratch." - ) - }, - ) - model_type: Optional[str] = field( - default=None, - metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)}, - ) - config_overrides: Optional[str] = field( - default=None, - metadata={ - "help": ( - "Override some existing default config settings when a model is trained from scratch. 
Example: " - "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index" - ) - }, - ) - config_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} - ) - tokenizer_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} - ) - cache_dir: Optional[str] = field( - default=None, - metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, - ) - use_fast_tokenizer: bool = field( - default=True, - metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, - ) - model_revision: str = field( - default="main", - metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, - ) - use_auth_token: bool = field( - default=False, - metadata={ - "help": ( - "Will use the token generated when running `huggingface-cli login` (necessary to use this script " - "with private models)." - ) - }, - ) - - def __post_init__(self): - if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None): - raise ValueError( - "--config_overrides can't be used in combination with --config_name or --model_name_or_path" - ) - - -@dataclass -class DataTrainingArguments: - """ - Arguments pertaining to what data we are going to input our model for training and eval. - """ - - dataset_name: Optional[str] = field( - default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} - ) - dataset_config_name: Optional[str] = field( - default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} - ) - train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) - validation_file: Optional[str] = field( - default=None, - metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, - ) - overwrite_cache: bool = field( - default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} - ) - validation_split_percentage: Optional[int] = field( - default=5, - metadata={ - "help": "The percentage of the train set used as validation set in case there's no validation split" - }, - ) - max_seq_length: Optional[int] = field( - default=None, - metadata={ - "help": ( - "The maximum total input sequence length after tokenization. Sequences longer " - "than this will be truncated." - ) - }, - ) - preprocessing_num_workers: Optional[int] = field( - default=None, - metadata={"help": "The number of processes to use for the preprocessing."}, - ) - mlm_probability: float = field( - default=0.15, metadata={"help": "Ratio of tokens to mask for masked language modeling loss"} - ) - line_by_line: bool = field( - default=False, - metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."}, - ) - pad_to_max_length: bool = field( - default=False, - metadata={ - "help": ( - "Whether to pad all samples to `max_seq_length`. " - "If False, will pad the samples dynamically when batching to the maximum length in the batch." - ) - }, - ) - max_train_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set." 
- ) - }, - ) - max_eval_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of evaluation examples to this " - "value if set." - ) - }, - ) - - def __post_init__(self): - if self.dataset_name is None and self.train_file is None and self.validation_file is None: - raise ValueError("Need either a dataset name or a training/validation file.") - else: - if self.train_file is not None: - extension = self.train_file.split(".")[-1] - assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file." - if self.validation_file is not None: - extension = self.validation_file.split(".")[-1] - assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." - - -@dataclass -class OptimizationArguments: - """ - Arguments pertaining to what type of optimization we are going to apply on the model. - """ - - tune: bool = field( - default=False, - metadata={"help": "Whether or not to apply quantization."}, - ) - quantization_approach: Optional[str] = field( - default="PostTrainingStatic", - metadata={"help": "Quantization approach. Supported approach are PostTrainingStatic, " - "PostTrainingDynamic and QuantizationAwareTraining."}, - ) - metric_name: Optional[str] = field( - default=None, - metadata={"help": "Metric used for the tuning strategy."}, - ) - is_relative: Optional[bool] = field( - default=True, - metadata={"help": "Metric tolerance model, expected to be relative or absolute."}, - ) - perf_tol: Optional[float] = field( - default=0.01, - metadata={"help": "Performance tolerance when optimizing the model."}, - ) - benchmark: bool = field( - default=False, - metadata={"help": "run benchmark."}) - int8: bool = field( - default=False, - metadata={"help":"Whether to use the quantized int8 model."}) - accuracy_only: bool = field( - default=False, - metadata={"help":"Whether to only test accuracy for model tuned by Neural Compressor."}) - -@dataclass -class DistributedArguments: - """ - Arguments setting the distributed multinode environment - """ - - worker: str = field( - default=None, - metadata={"help": "List of node ip addresses in a string, and there should not be space between addresses."}, - ) - task_index: int = field( - default=0, - metadata={"help": "Worker index, and 0 represents the chief worker while other workers are set as 1,2,3..."}, - ) - - -# endregion - - -def main(): - # region Argument Parsing - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments, OptimizationArguments, DistributedArguments)) - if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): - # If we pass only one argument to the script and it's the path to a json file, - # let's parse it to get our arguments. 
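Since `HfArgumentParser` accepts a single JSON file in place of command-line flags, a full tuning run can be captured declaratively. A sketch of that path, assuming it runs inside this script where the five dataclasses above are defined; the file name and every value are illustrative only:

```python
import json
import os
from transformers import HfArgumentParser, TFTrainingArguments

# Mirrors the parser constructed in main() below.
parser = HfArgumentParser(
    (ModelArguments, DataTrainingArguments, TFTrainingArguments,
     OptimizationArguments, DistributedArguments)
)

with open("args.json", "w") as f:
    json.dump({
        "model_name_or_path": "distilbert-base-uncased",
        "dataset_name": "wikitext",
        "dataset_config_name": "wikitext-2-raw-v1",
        "output_dir": "./saved_results",
        "do_train": True,
        "tune": True,
        "perf_tol": 0.01,
    }, f)

model_args, data_args, training_args, optim_args, distributed_args = \
    parser.parse_json_file(json_file=os.path.abspath("args.json"))
```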
- model_args, data_args, training_args, optim_args, distributed_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) - else: - model_args, data_args, training_args, optim_args, distributed_args = parser.parse_args_into_dataclasses() - - # region Setup logging - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - handlers=[logging.StreamHandler(sys.stdout)], - ) - logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN) - - # Set the verbosity to info of the Transformers logger (on main process only): - if is_main_process(training_args.local_rank): - transformers.utils.logging.set_verbosity_info() - transformers.utils.logging.enable_default_handler() - transformers.utils.logging.enable_explicit_format() - logger.info(f"Training/evaluation parameters {training_args}") - # endregion - - # Sanity checks - if data_args.dataset_name is None and data_args.train_file is None and data_args.validation_file is None: - raise ValueError("Need either a dataset name or a training/validation file.") - else: - if data_args.train_file is not None: - extension = data_args.train_file.split(".")[-1] - assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, json or txt file." - if data_args.validation_file is not None: - extension = data_args.validation_file.split(".")[-1] - assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, json or txt file." - - if training_args.output_dir is not None: - training_args.output_dir = Path(training_args.output_dir) - os.makedirs(training_args.output_dir, exist_ok=True) - # endregion - - # region Set the multinode environment, the strategy and paths - strategy = None - worker_list = None - if distributed_args.worker is not None: - logger.info("distributed environment initialization...") - - worker_list = distributed_args.worker.split(",") - - from intel_extension_for_transformers.transformers.utils.utility_tf import distributed_init - distributed_init(worker_list, "worker", distributed_args.task_index) - - strategy = tf.distribute.MultiWorkerMirroredStrategy() - from intel_extension_for_transformers.transformers.utils.utility_tf import get_filepath - training_args.output_dir = get_filepath(training_args.output_dir, strategy.cluster_resolver.task_type, strategy.cluster_resolver.task_id) - else: - strategy = training_args.strategy - #endregion - - - # region Checkpoints - # Detecting last checkpoint. - checkpoint = None - if len(os.listdir(training_args.output_dir)) > 0 and not training_args.overwrite_output_dir: - config_path = training_args.output_dir / CONFIG_NAME - weights_path = training_args.output_dir / TF2_WEIGHTS_NAME - if config_path.is_file() and weights_path.is_file(): - checkpoint = training_args.output_dir - logger.warning( - f"Checkpoint detected, resuming training from checkpoint in {training_args.output_dir}. To avoid this" - " behavior, change the `--output_dir` or add `--overwrite_output_dir` to train from scratch." - ) - else: - raise ValueError( - f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to continue regardless." - ) - - # endregion - - # If passed along, set the training seed now. 
- if training_args.seed is not None: - set_seed(training_args.seed) - - # region Load datasets - # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) - # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ - # (the dataset will be downloaded automatically from the datasets Hub). - # - # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called - # 'text' is found. You can easily tweak this behavior (see below). - # - # In distributed training, the load_dataset function guarantee that only one local process can concurrently - # download the dataset. - if data_args.dataset_name is not None: - # Downloading and loading a dataset from the hub. - raw_datasets = load_dataset( - data_args.dataset_name, - data_args.dataset_config_name, - cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, - ) - if "validation" not in raw_datasets.keys(): - raw_datasets["validation"] = load_dataset( - data_args.dataset_name, - data_args.dataset_config_name, - split=f"train[:{data_args.validation_split_percentage}%]", - cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, - ) - raw_datasets["train"] = load_dataset( - data_args.dataset_name, - data_args.dataset_config_name, - split=f"train[{data_args.validation_split_percentage}%:]", - cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, - ) - else: - data_files = {} - dataset_args = {} - if data_args.train_file is not None: - data_files["train"] = data_args.train_file - if data_args.validation_file is not None: - data_files["validation"] = data_args.validation_file - extension = ( - data_args.train_file.split(".")[-1] - if data_args.train_file is not None - else data_args.validation_file.split(".")[-1] - ) - if extension == "txt": - extension = "text" - dataset_args["keep_linebreaks"] = data_args.keep_linebreaks - raw_datasets = load_dataset( - extension, - data_files=data_files, - cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, - **dataset_args, - ) - # If no validation data is there, validation_split_percentage will be used to divide the dataset. - if "validation" not in raw_datasets.keys(): - raw_datasets["validation"] = load_dataset( - extension, - data_files=data_files, - split=f"train[:{data_args.validation_split_percentage}%]", - cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, - **dataset_args, - ) - raw_datasets["train"] = load_dataset( - extension, - data_files=data_files, - split=f"train[{data_args.validation_split_percentage}%:]", - cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, - **dataset_args, - ) - - # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at - # https://huggingface.co/docs/datasets/loading_datasets.html. - # endregion - - # region Load pretrained model and tokenizer - # - # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently - # download model & vocab. 
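The `train[:5%]`/`train[5%:]` strings above use the datasets library's split-slicing syntax, which makes it easy to carve a validation set out of a train-only dataset. A standalone sketch (dataset name is just an example):

```python
from datasets import load_dataset

pct = 5  # validation_split_percentage
# Hold out the first 5% of the train split for validation; train on the rest.
validation = load_dataset("wikitext", "wikitext-2-raw-v1", split=f"train[:{pct}%]")
train = load_dataset("wikitext", "wikitext-2-raw-v1", split=f"train[{pct}%:]")
print(len(train), len(validation))
```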
- config = AutoConfig.from_pretrained( - model_args.config_name if model_args.config_name else model_args.model_name_or_path, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - _commit_hash="main", - ) - tokenizer = AutoTokenizer.from_pretrained( - model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, - cache_dir=model_args.cache_dir, - use_fast=model_args.use_fast_tokenizer, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - _commit_hash="main", - ) - # endregion - - # region Dataset preprocessing - # First we tokenize all the texts. - column_names = raw_datasets["train"].column_names - text_column_name = "text" if "text" in column_names else column_names[0] - - if data_args.max_seq_length is None: - max_seq_length = tokenizer.model_max_length - if max_seq_length > 1024: - logger.warning( - f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " - "Picking 1024 instead. You can reduce that default value by passing --max_seq_length xxx." - ) - max_seq_length = 1024 - else: - if data_args.max_seq_length > tokenizer.model_max_length: - logger.warning( - f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" - f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." - ) - max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) - - if data_args.line_by_line: - # When using line_by_line, we just tokenize each nonempty line. - padding = "max_length" if data_args.pad_to_max_length else False - - def tokenize_function(examples): - # Remove empty lines - examples[text_column_name] = [ - line for line in examples[text_column_name] if len(line) > 0 and not line.isspace() - ] - return tokenizer( - examples[text_column_name], - padding=padding, - truncation=True, - max_length=max_seq_length, - # We use this option because DataCollatorForLanguageModeling (see below) is more efficient when it - # receives the `special_tokens_mask`. - return_special_tokens_mask=True, - return_token_type_ids=True - ) - - tokenized_datasets = raw_datasets.map( - tokenize_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=[text_column_name], - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on dataset line_by_line", - ) - else: - # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts. - # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more - # efficient when it receives the `special_tokens_mask`. - def tokenize_function(examples): - return tokenizer(examples[text_column_name], return_special_tokens_mask=True, return_token_type_ids=True) - - tokenized_datasets = raw_datasets.map( - tokenize_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on every text in dataset", - ) - - # Main data processing function that will concatenate all texts from our dataset and generate chunks of - # max_seq_length. - def group_texts(examples): - # Concatenate all texts. 
- concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} - total_length = len(concatenated_examples[list(examples.keys())[0]]) - # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can - # customize this part to your needs. - if total_length >= max_seq_length: - total_length = (total_length // max_seq_length) * max_seq_length - # Split by chunks of max_len. - result = { - k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)] - for k, t in concatenated_examples.items() - } - return result - - # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a - # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value - # might be slower to preprocess. - # - # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: - # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map - - tokenized_datasets = tokenized_datasets.map( - group_texts, - batched=True, - num_proc=data_args.preprocessing_num_workers, - load_from_cache_file=not data_args.overwrite_cache, - desc=f"Grouping texts in chunks of {max_seq_length}", - ) - - train_dataset = tokenized_datasets["train"] - - if data_args.validation_file is not None: - eval_dataset = tokenized_datasets["validation"] - else: - logger.info( - f"Validation file not found: using {data_args.validation_split_percentage}% of the dataset as validation" - " as provided in data_args" - ) - train_indices, val_indices = train_test_split( - list(range(len(train_dataset))), test_size=data_args.validation_split_percentage / 100 - ) - - eval_dataset = train_dataset.select(val_indices) - train_dataset = train_dataset.select(train_indices) - - if data_args.max_train_samples is not None: - max_train_samples = min(len(train_dataset), data_args.max_train_samples) - train_dataset = train_dataset.select(range(max_train_samples)) - if data_args.max_eval_samples is not None: - max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) - eval_dataset = eval_dataset.select(range(max_eval_samples)) - - # Log a few random samples from the training set: - for index in random.sample(range(len(train_dataset)), min(3, len(train_dataset))): - logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") - # endregion - - with strategy.scope(): - # region Prepare model - if checkpoint is not None: - model = TFAutoModelForMaskedLM.from_pretrained(checkpoint, config=config, cache_dir=model_args.cache_dir,) - elif model_args.model_name_or_path: - model = TFAutoModelForMaskedLM.from_pretrained(model_args.model_name_or_path, config=config, - cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None,) - else: - logger.info("Training new model from scratch") - model = TFAutoModelForMaskedLM.from_config(config) - - model.resize_token_embeddings(len(tokenizer)) - # endregion - - # region TF Dataset preparation - num_replicas = training_args.strategy.num_replicas_in_sync - data_collator = DataCollatorForLanguageModeling( - tokenizer=tokenizer, mlm_probability=data_args.mlm_probability, return_tensors="tf" - ) - options = tf.data.Options() - options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF - - # model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to 
use in - # training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also - # use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names - # yourself if you use this method, whereas they are automatically inferred from the model input names when - # using model.prepare_tf_dataset() - # For more info see the docs: - # https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset - # https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset - - tf_train_dataset = model.prepare_tf_dataset( - train_dataset, - shuffle=True, - batch_size=num_replicas * training_args.per_device_train_batch_size, - collate_fn=data_collator, - ).with_options(options) - - tf_eval_dataset = model.prepare_tf_dataset( - eval_dataset, - # labels are passed as input, as we will use the model's internal loss - shuffle=False, - batch_size=num_replicas * training_args.per_device_eval_batch_size, - collate_fn=data_collator, - drop_remainder=True, - ).with_options(options) - # endregion - - # region Optimizer and loss - num_train_steps = len(tf_train_dataset) * int(training_args.num_train_epochs) - if training_args.warmup_steps > 0: - num_warmup_steps = training_args.warmup_steps - elif training_args.warmup_ratio > 0: - num_warmup_steps = int(num_train_steps * training_args.warmup_ratio) - else: - num_warmup_steps = 0 - - # Bias and layernorm weights are automatically excluded from the decay - optimizer, lr_schedule = create_optimizer( - init_lr=training_args.learning_rate, - num_train_steps=num_train_steps, - num_warmup_steps=num_warmup_steps, - adam_beta1=training_args.adam_beta1, - adam_beta2=training_args.adam_beta2, - adam_epsilon=training_args.adam_epsilon, - weight_decay_rate=training_args.weight_decay, - adam_global_clipnorm=training_args.max_grad_norm, - ) - - # no user-specified loss = will use the model internal loss - model.compile(optimizer=optimizer, jit_compile=training_args.xla, run_eagerly=True) - # endregion - - def compute_metrics(preds, labels): - preds = preds["logits"] - return hf_compute_loss(labels, preds) - - # loss function for CLM model - def hf_compute_loss(labels, logits): - loss_fn = tf.keras.losses.SparseCategoricalCrossentropy( - from_logits=True, reduction=tf.keras.losses.Reduction.NONE - ) - - # Clip negative labels to zero here to avoid NaNs and errors - those positions will get masked later anyway - unmasked_loss = loss_fn(tf.nn.relu(labels), logits) - # make sure only labels that are not equal to -100 affect the loss - loss_mask = tf.cast(labels != -100, dtype=unmasked_loss.dtype) - masked_loss = unmasked_loss * loss_mask - reduced_masked_loss = tf.reduce_sum(masked_loss) / tf.reduce_sum(loss_mask) - return tf.reshape(reduced_masked_loss, (1,)) - - def eval_func_mlm(model): - label_ids: np.ndarray = None - - num_examples = sum(1 for _ in ( - tf_eval_dataset.unbatch() if hasattr(tf_eval_dataset, "unbatch") else tf_eval_dataset)) - logger.info(f"***** Running Evaluation *****") - logger.info(f" Num examples in dataset = {num_examples}") - logger.info(f" Batch size = {training_args.per_device_eval_batch_size}") - - preds: np.ndarray = None - infer = model.signatures["serving_default"] - - for idx, (inputs, labels) in enumerate(tf_eval_dataset): - for name in inputs: - inputs[name] = tf.constant(inputs[name].numpy(), dtype=infer.inputs[0].dtype) - - results = infer(**inputs) - - if preds is None: - 
preds = results["Identity"].numpy() - else: - preds = np.append(preds, results["Identity"].numpy(), axis=0) - - if label_ids is None: - label_ids = labels[0].numpy() if isinstance( - labels, list) else labels.numpy() - else: - label_ids = np.append( - label_ids, - labels[0].numpy() - if isinstance(labels, list) else labels.numpy(), - axis=0) - test_predictions = {"logits": preds} - loss = compute_metrics(test_predictions, label_ids) - - return loss.numpy()[0] - - # region tuning - if optim_args.tune: - from intel_extension_for_transformers.transformers import metrics, objectives, QuantizationConfig, TFOptimization - optimization = TFOptimization( - model=model, - args=training_args, - train_dataset=tf_train_dataset, - eval_dataset=tf_eval_dataset, - compute_metrics=compute_metrics, - task_type=strategy.cluster_resolver.task_type if isinstance(strategy, tf.distribute.MultiWorkerMirroredStrategy) else None, - task_id=strategy.cluster_resolver.task_id if isinstance(strategy, tf.distribute.MultiWorkerMirroredStrategy) else None, - ) - - # use customized eval function - optimization.eval_func = eval_func_mlm - - tune_metric = metrics.Metric( - name="loss", greater_is_better=False, is_relative=True, criterion=optim_args.perf_tol, - ) - quantization_config = QuantizationConfig( - framework="tensorflow", - approach="POSTTRAININGSTATIC", - metrics=[tune_metric], - objectives=[objectives.performance] - ) - quantized_model = optimization.quantize(quant_config=quantization_config) - exit(0) - # endregion - - # region Training and validation - if training_args.do_train: - logger.info("***** Running training *****") - logger.info(f" Num examples = {len(train_dataset)}") - logger.info(f" Num Epochs = {training_args.num_train_epochs}") - logger.info(f" Instantaneous batch size per device = {training_args.per_device_train_batch_size}") - logger.info(f" Total train batch size = {training_args.per_device_train_batch_size * num_replicas}") - - # For long training runs, you may wish to use the PushToHub() callback here to save intermediate checkpoints - # to the Hugging Face Hub rather than just pushing the finished model. 
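The `-100` convention in `hf_compute_loss` above deserves a concrete check: positions labeled `-100` are clipped to a valid class id so the loss op doesn't fail, then zeroed out by the mask, so only supervised positions contribute. A tiny self-contained example with toy shapes:

```python
import tensorflow as tf

labels = tf.constant([[5, -100, 2]])   # the middle position carries no MLM label
logits = tf.random.normal((1, 3, 10))  # (batch, seq_len, vocab_size)

loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction=tf.keras.losses.Reduction.NONE
)
unmasked = loss_fn(tf.nn.relu(labels), logits)  # relu clips -100 to class 0
mask = tf.cast(labels != -100, unmasked.dtype)  # 1 where a real label exists
loss = tf.reduce_sum(unmasked * mask) / tf.reduce_sum(mask)
print(float(loss))  # mean loss over the two supervised positions only
```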
- # See https://huggingface.co/docs/transformers/main_classes/keras_callbacks#transformers.PushToHubCallback - - history = model.fit( - tf_train_dataset, - validation_data=tf_eval_dataset, - epochs=int(training_args.num_train_epochs), - callbacks=callbacks, - ) - train_loss = history.history["loss"][-1] - try: - train_perplexity = math.exp(train_loss) - except OverflowError: - train_perplexity = math.inf - logger.info(f" Final train loss: {train_loss:.3f}") - logger.info(f" Final train perplexity: {train_perplexity:.3f}") - - validation_loss = history.history["val_loss"][-1] - try: - validation_perplexity = math.exp(validation_loss) - except OverflowError: - validation_perplexity = math.inf - logger.info(f" Final validation loss: {validation_loss:.3f}") - logger.info(f" Final validation perplexity: {validation_perplexity:.3f}") - - if training_args.output_dir is not None: - output_eval_file = os.path.join(training_args.output_dir, "all_results.json") - results_dict = dict() - results_dict["train_loss"] = train_loss - results_dict["train_perplexity"] = train_perplexity - results_dict["eval_loss"] = validation_loss - results_dict["eval_perplexity"] = validation_perplexity - with open(output_eval_file, "w") as writer: - writer.write(json.dumps(results_dict)) - # endregion - - # region Evaluation - if training_args.do_eval: - num_examples = sum(1 for _ in ( - tf_eval_dataset.unbatch() if hasattr(tf_eval_dataset, "unbatch") else tf_eval_dataset)) - - if optim_args.int8: - model = tf.saved_model.load(training_args.output_dir) - else: - from intel_extension_for_transformers.transformers.utils.utility_tf import keras2SavedModel - model = keras2SavedModel(model) - - preds: np.ndarray = None - label_ids: np.ndarray = None - infer = model.signatures["serving_default"] - - if optim_args.accuracy_only: - iterations = 1 - warmup = 0 - else: - iterations = 10 - warmup = 5 - latency_list = [] - - for idx in range(iterations): - iteration_time = 0 - for i, (inputs, labels) in enumerate(tf_eval_dataset): - for name in inputs: - inputs[name] = tf.constant(inputs[name].numpy(), dtype=infer.inputs[0].dtype) - - start = time.time() - results = infer(**inputs) - iteration_time += time.time() - start - if idx == 0: # only accumulate once all the preds and labels - if preds is None: - preds = results["Identity"].numpy() - else: - preds = np.append(preds, results["Identity"].numpy(), axis=0) - - if label_ids is None: - label_ids = labels[0].numpy() if isinstance( - labels, list) else labels.numpy() - else: - label_ids = np.append( - label_ids, - labels[0].numpy() - if isinstance(labels, list) else labels.numpy(), - axis=0) - latency_list.append(iteration_time) - logger.info("Iteration {} time: {} sec".format(idx, iteration_time)) - - loss = compute_metrics({"logits": preds}, label_ids) - logger.info("\nEvaluation result: ") - logger.info("Accuracy: {}".format(loss.numpy()[0])) - - average_iteration_time = np.array(latency_list[warmup:]).mean() - logger.info( - "Throughput: {} samples/sec".format( - num_examples / average_iteration_time) - ) - #endregion - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/examples/huggingface/tensorflow/multiple-choice/quantization/README.md b/examples/huggingface/tensorflow/multiple-choice/quantization/README.md deleted file mode 100644 index d204e4f0ae0..00000000000 --- a/examples/huggingface/tensorflow/multiple-choice/quantization/README.md +++ /dev/null @@ -1,34 +0,0 @@ -Step-by-Step -========= - -This document describes the step-by-step 
instructions for reproducing the quantization on models for the multiple choice tasks on the SWAG dataset. - -# Prerequisite -## 1. Installation - -Make sure you have installed Intel® Extension for Transformers and all the dependencies in the current example: - -```shell -pip install intel-extension-for-transformers -pip install -r requirements.txt -``` - -# Run - -## 1. Run Command (Shell) - -- Topology: - - distilbert_swag - -- To get the int8 model - -``` -bash run_tuning.sh --topology=[topology] -``` - -- To benchmark the int8 model - - -``` -bash run_benchmark.sh --topology=[topology] --mode=benchmark --int8=true -``` \ No newline at end of file diff --git a/examples/huggingface/tensorflow/multiple-choice/quantization/requirements.txt b/examples/huggingface/tensorflow/multiple-choice/quantization/requirements.txt deleted file mode 100644 index ffa62da04e1..00000000000 --- a/examples/huggingface/tensorflow/multiple-choice/quantization/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -datasets >= 1.17 -sentencepiece != 0.1.92 -protobuf -intel-tensorflow -transformers -accelerate \ No newline at end of file diff --git a/examples/huggingface/tensorflow/multiple-choice/quantization/run_benchmark.sh b/examples/huggingface/tensorflow/multiple-choice/quantization/run_benchmark.sh deleted file mode 100644 index d43bc97a53f..00000000000 --- a/examples/huggingface/tensorflow/multiple-choice/quantization/run_benchmark.sh +++ /dev/null @@ -1,101 +0,0 @@ -#!/bin/bash -set -x - -function main { - - init_params "$@" - run_benchmark - -} - -# init params -function init_params { - topology="distilbert_swag" - iters=100 - batch_size=16 - tuned_checkpoint=saved_results - cache_dir="cache" - for var in "$@" - do - case $var in - --topology=*) - topology=$(echo $var |cut -f2 -d=) - ;; - --dataset_location=*) - dataset_location=$(echo $var |cut -f2 -d=) - ;; - --input_model=*) - input_model=$(echo $var |cut -f2 -d=) - ;; - --mode=*) - mode=$(echo $var |cut -f2 -d=) - ;; - --batch_size=*) - batch_size=$(echo $var |cut -f2 -d=) - ;; - --iters=*) - iters=$(echo ${var} |cut -f2 -d=) - ;; - --int8=*) - int8=$(echo ${var} |cut -f2 -d=) - ;; - --config=*) - tuned_checkpoint=$(echo $var |cut -f2 -d=) - ;; - --worker=*) - worker=$(echo $var |cut -f2 -d=) - ;; - --task_index=*) - task_index=$(echo $var |cut -f2 -d=) - ;; - --cache_dir=*) - cache_dir=$(echo $var |cut -f2 -d=) - ;; - *) - echo "Error: No such parameter: ${var}" - exit 1 - ;; - esac - done - -} - - -# run_benchmark -function run_benchmark { - extra_cmd='' - MAX_SEQ_LENGTH=128 - - if [[ ${mode} == "accuracy" ]]; then - mode_cmd=" --accuracy_only" - elif [[ ${mode} == "benchmark" ]]; then - mode_cmd=" --benchmark " - else - echo "Error: No such mode: ${mode}" - exit 1 - fi - - if [ "${topology}" = "distilbert_swag" ]; then - script="run_swag.py" - model_name_or_path="Rocketknight1/bert-base-uncased-finetuned-swag" - # add following parameters for quicker debugging - extra_cmd=$extra_cmd" --max_eval_samples 512" - fi - - if [[ ${int8} == "true" ]]; then - extra_cmd=$extra_cmd" --int8" - fi - echo $extra_cmd - - python -u ${script} \ - --model_name_or_path ${model_name_or_path} \ - --do_eval \ - --per_device_eval_batch_size ${batch_size} \ - --output_dir ${tuned_checkpoint} \ - --overwrite_output_dir \ - --cache_dir ${cache_dir} \ - ${mode_cmd} \ - ${extra_cmd} -} - -main "$@" diff --git a/examples/huggingface/tensorflow/multiple-choice/quantization/run_swag.py b/examples/huggingface/tensorflow/multiple-choice/quantization/run_swag.py deleted file mode
100644 index dff1ae14227..00000000000 --- a/examples/huggingface/tensorflow/multiple-choice/quantization/run_swag.py +++ /dev/null @@ -1,653 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright The HuggingFace Team and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Fine-tuning the library models for multiple choice. -""" -# You can also adapt this script on your own multiple choice task. Pointers for this are left as comments. - -import json -import logging -import os -import sys -from dataclasses import dataclass, field -from itertools import chain -from pathlib import Path -from typing import Optional, Union -import numpy as np - -import datasets -import tensorflow as tf -from datasets import load_dataset - -import time - -import transformers -from transformers import ( - CONFIG_NAME, - TF2_WEIGHTS_NAME, - AutoConfig, - AutoTokenizer, - DefaultDataCollator, - HfArgumentParser, - PushToHubCallback, - TFAutoModelForMultipleChoice, - TFTrainingArguments, - create_optimizer, - set_seed, -) -from transformers.tokenization_utils_base import PreTrainedTokenizerBase -from transformers.utils import PaddingStrategy, check_min_version, send_example_telemetry -from transformers.trainer_utils import is_main_process - -logger = logging.getLogger(__name__) - - -# region Helper classes and functions - - -@dataclass -class DataCollatorForMultipleChoice: - """ - Data collator that will dynamically pad the inputs for multiple choice received. - Args: - tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]): - The tokenizer used for encoding the data. - padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`): - Select a strategy to pad the returned sequences (according to the model's padding side and padding index) - among: - - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single sequence - if provided). - - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum - acceptable input length for the model if that argument is not provided. - - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different - lengths). - max_length (`int`, *optional*): - Maximum length of the returned list and optionally padding length (see above). - pad_to_multiple_of (`int`, *optional*): - If set will pad the sequence to a multiple of the provided value. - This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >= - 7.5 (Volta). 
- """ - - tokenizer: PreTrainedTokenizerBase - padding: Union[bool, str, PaddingStrategy] = True - max_length: Optional[int] = None - pad_to_multiple_of: Optional[int] = None - - def __call__(self, features): - label_name = "label" if "label" in features[0].keys() else "labels" - labels = [feature.pop(label_name) for feature in features] - batch_size = len(features) - num_choices = len(features[0]["input_ids"]) - flattened_features = [ - [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features - ] - flattened_features = list(chain(*flattened_features)) - - batch = self.tokenizer.pad( - flattened_features, - padding=self.padding, - max_length=self.max_length, - pad_to_multiple_of=self.pad_to_multiple_of, - return_tensors="tf", - ) - - # Un-flatten - batch = {k: tf.reshape(v, (batch_size, num_choices, -1)) for k, v in batch.items()} - # Add back labels - batch["labels"] = tf.convert_to_tensor(labels, dtype=tf.int64) - return batch - - -# endregion - -# region Arguments -@dataclass -class ModelArguments: - """ - Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. - """ - - model_name_or_path: str = field( - metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} - ) - config_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} - ) - tokenizer_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} - ) - cache_dir: Optional[str] = field( - default=None, - metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, - ) - use_fast_tokenizer: bool = field( - default=True, - metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, - ) - model_revision: str = field( - default="main", - metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, - ) - use_auth_token: bool = field( - default=False, - metadata={ - "help": ( - "Will use the token generated when running `huggingface-cli login` (necessary to use this script " - "with private models)." - ) - }, - ) - - -@dataclass -class DataTrainingArguments: - """ - Arguments pertaining to what data we are going to input our model for training and eval. - """ - - train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) - validation_file: Optional[str] = field( - default=None, - metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, - ) - overwrite_cache: bool = field( - default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} - ) - preprocessing_num_workers: Optional[int] = field( - default=None, - metadata={"help": "The number of processes to use for the preprocessing."}, - ) - max_seq_length: Optional[int] = field( - default=None, - metadata={ - "help": ( - "The maximum total input sequence length after tokenization. If passed, sequences longer " - "than this will be truncated, sequences shorter will be padded." - ) - }, - ) - pad_to_max_length: bool = field( - default=False, - metadata={ - "help": ( - "Whether to pad all samples to the maximum sentence length. " - "If False, will pad the samples dynamically when batching to the maximum length in the batch. More " - "efficient on GPU but very bad for TPU." 
- ) - }, - ) - max_train_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set." - ) - }, - ) - max_eval_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of evaluation examples to this " - "value if set." - ) - }, - ) - - def __post_init__(self): - if self.train_file is not None: - extension = self.train_file.split(".")[-1] - assert extension in ["csv", "json"], "`train_file` should be a csv or a json file." - if self.validation_file is not None: - extension = self.validation_file.split(".")[-1] - assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file." - - - -@dataclass -class OptimizationArguments: - """ - Arguments pertaining to what type of optimization we are going to apply on the model. - """ - - tune: bool = field( - default=False, - metadata={"help": "Whether or not to apply quantization."}, - ) - quantization_approach: Optional[str] = field( - default="PostTrainingStatic", - metadata={"help": "Quantization approach. Supported approach are PostTrainingStatic, " - "PostTrainingDynamic and QuantizationAwareTraining."}, - ) - metric_name: Optional[str] = field( - default=None, - metadata={"help": "Metric used for the tuning strategy."}, - ) - is_relative: Optional[bool] = field( - default=True, - metadata={"help": "Metric tolerance model, expected to be relative or absolute."}, - ) - perf_tol: Optional[float] = field( - default=0.01, - metadata={"help": "Performance tolerance when optimizing the model."}, - ) - benchmark: bool = field( - default=False, - metadata={"help": "run benchmark."}) - int8: bool = field( - default=False, - metadata={"help":"Whether to use the quantized int8 model."}) - accuracy_only: bool = field( - default=False, - metadata={"help":"Whether to only test accuracy for model tuned by Neural Compressor."}) - -@dataclass -class DistributedArguments: - """ - Arguments setting the distributed multinode environment - """ - - worker: str = field( - default=None, - metadata={"help": "List of node ip addresses in a string, and there should not be space between addresses."}, - ) - task_index: int = field( - default=0, - metadata={"help": "Worker index, and 0 represents the chief worker while other workers are set as 1,2,3..."}, - ) - -# endregion - - - -def main(): - # region Argument Parsing - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments, OptimizationArguments, DistributedArguments)) - if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): - # If we pass only one argument to the script and it's the path to a json file, - # let's parse it to get our arguments. 
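Before the rest of `main()`, it may help to see the `DataCollatorForMultipleChoice` defined at the top of this file in isolation: it flattens each example's choices, pads the flattened batch once, then restores the `(batch_size, num_choices, seq_len)` shape. A toy invocation (checkpoint name and token ids are purely illustrative):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
collator = DataCollatorForMultipleChoice(tokenizer)  # class defined above

# One example with four choices of deliberately different lengths.
features = [{
    "input_ids": [[101, 7592, 102], [101, 102], [101, 2088, 2088, 102], [101, 102]],
    "attention_mask": [[1, 1, 1], [1, 1], [1, 1, 1, 1], [1, 1]],
    "label": 2,
}]
batch = collator(features)
print(batch["input_ids"].shape)  # (1, 4, 4): padded to the longest choice
print(batch["labels"])           # tf.Tensor([2])
```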
- model_args, data_args, training_args, optim_args, distributed_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) - else: - model_args, data_args, training_args, optim_args, distributed_args = parser.parse_args_into_dataclasses() - - output_dir = Path(training_args.output_dir) - output_dir.mkdir(parents=True, exist_ok=True) - # endregion - - # region Logging - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - handlers=[logging.StreamHandler(sys.stdout)], - ) - logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN) - - # Set the verbosity to info of the Transformers logger (on main process only): - if is_main_process(training_args.local_rank): - transformers.utils.logging.set_verbosity_info() - transformers.utils.logging.enable_default_handler() - transformers.utils.logging.enable_explicit_format() - logger.info(f"Training/evaluation parameters {training_args}") - # endregion - - # region Set the multinode environment, the strategy and paths - strategy = None - worker_list = None - if distributed_args.worker is not None: - logger.info("distributed environment initialization...") - - worker_list = distributed_args.worker.split(",") - - from intel_extension_for_transformers.transformers.utils.utility_tf import distributed_init - distributed_init(worker_list, "worker", distributed_args.task_index) - - strategy = tf.distribute.MultiWorkerMirroredStrategy() - from intel_extension_for_transformers.transformers.utils.utility_tf import get_filepath - training_args.output_dir = get_filepath(training_args.output_dir, strategy.cluster_resolver.task_type, strategy.cluster_resolver.task_id) - else: - strategy = training_args.strategy - #endregion - - # region Checkpoints - checkpoint = None - if len(os.listdir(training_args.output_dir)) > 0 and not training_args.overwrite_output_dir: - if (output_dir / CONFIG_NAME).is_file() and (output_dir / TF2_WEIGHTS_NAME).is_file(): - checkpoint = output_dir - logger.info( - f"Checkpoint detected, resuming training from checkpoint in {training_args.output_dir}. To avoid this" - " behavior, change the `--output_dir` or add `--overwrite_output_dir` to train from scratch." - ) - else: - raise ValueError( - f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to continue regardless." - ) - # endregion - - # Set seed before initializing model. - set_seed(training_args.seed) - - # region Load datasets - # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) - # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ - # (the dataset will be downloaded automatically from the datasets Hub). - - # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called - # 'text' is found. You can easily tweak this behavior (see below). - - # In distributed training, the load_dataset function guarantee that only one local process can concurrently - # download the dataset. 
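Each SWAG example pairs a context sentence (`sent1`) and the start of a second sentence (`sent2`) with four candidate endings (`ending0`..`ending3`) and an integer `label`; the preprocessing below repeats the context four times and glues the header onto each ending. A minimal sketch of that pairing on one made-up example:

```python
example = {
    "sent1": "A woman is outside with a bucket.",
    "sent2": "She",
    "ending0": "pours the water away.",
    "ending1": "sits down on the grass.",
    "ending2": "washes the dog.",
    "ending3": "walks back inside.",
    "label": 2,
}
first_sentences = [example["sent1"]] * 4
second_sentences = [f"{example['sent2']} {example[f'ending{i}']}" for i in range(4)]
# The tokenizer then encodes these four (context, continuation) pairs as one group.
for first, second in zip(first_sentences, second_sentences):
    print(first, "||", second)
```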
- if data_args.train_file is not None or data_args.validation_file is not None: - data_files = {} - if data_args.train_file is not None: - data_files["train"] = data_args.train_file - if data_args.validation_file is not None: - data_files["validation"] = data_args.validation_file - extension = data_args.train_file.split(".")[-1] - raw_datasets = load_dataset( - extension, - data_files=data_files, - cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, - ) - else: - # Downloading and loading the swag dataset from the hub. - raw_datasets = load_dataset( - "swag", - "regular", - cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, - ) - # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at - # https://huggingface.co/docs/datasets/loading_datasets.html. - - # When using your own dataset or a different dataset from swag, you will probably need to change this. - ending_names = [f"ending{i}" for i in range(4)] - context_name = "sent1" - question_header_name = "sent2" - # endregion - - # region Load model config and tokenizer - if checkpoint is not None: - config_path = training_args.output_dir - elif model_args.config_name: - config_path = model_args.config_name - else: - config_path = model_args.model_name_or_path - - # Distributed training: - # The .from_pretrained methods guarantee that only one local process can concurrently - # download model & vocab. - config = AutoConfig.from_pretrained( - config_path, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - _commit_hash="main", - ) - tokenizer = AutoTokenizer.from_pretrained( - model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, - cache_dir=model_args.cache_dir, - use_fast=model_args.use_fast_tokenizer, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - _commit_hash="main", - ) - # endregion - - # region Dataset preprocessing - if data_args.max_seq_length is None: - max_seq_length = tokenizer.model_max_length - if max_seq_length > 1024: - logger.warning( - f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " - "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx." - ) - max_seq_length = 1024 - else: - if data_args.max_seq_length > tokenizer.model_max_length: - logger.warning( - f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" - f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." 
- ) - max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) - - def preprocess_function(examples): - first_sentences = [[context] * 4 for context in examples[context_name]] - question_headers = examples[question_header_name] - second_sentences = [ - [f"{header} {examples[end][i]}" for end in ending_names] for i, header in enumerate(question_headers) - ] - - # Flatten out - first_sentences = list(chain(*first_sentences)) - second_sentences = list(chain(*second_sentences)) - - # Tokenize - tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True, max_length=max_seq_length) - # Un-flatten - data = {k: [v[i : i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items()} - return data - - - train_dataset = raw_datasets["train"] - if data_args.max_train_samples is not None: - max_train_samples = min(len(train_dataset), data_args.max_train_samples) - train_dataset = train_dataset.select(range(max_train_samples)) - train_dataset = train_dataset.map( - preprocess_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - load_from_cache_file=not data_args.overwrite_cache, - ) - - - eval_dataset = raw_datasets["validation"] - if data_args.max_eval_samples is not None: - max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) - eval_dataset = eval_dataset.select(range(max_eval_samples)) - eval_dataset = eval_dataset.map( - preprocess_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - load_from_cache_file=not data_args.overwrite_cache, - ) - - if data_args.pad_to_max_length: - data_collator = DefaultDataCollator(return_tensors="tf") - else: - # custom class defined above, as HF has no data collator for multiple choice - data_collator = DataCollatorForMultipleChoice(tokenizer) - # endregion - - with strategy.scope(): - # region Build model - if checkpoint is None: - model_path = model_args.model_name_or_path - else: - model_path = checkpoint - model = TFAutoModelForMultipleChoice.from_pretrained( - model_path, - config=config, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - ) - - num_replicas = training_args.strategy.num_replicas_in_sync - total_train_batch_size = training_args.per_device_train_batch_size * num_replicas - total_eval_batch_size = training_args.per_device_eval_batch_size * num_replicas - - num_train_steps = (len(train_dataset) // total_train_batch_size) * int(training_args.num_train_epochs) - if training_args.warmup_steps > 0: - num_warmup_steps = training_args.warmup_steps - elif training_args.warmup_ratio > 0: - num_warmup_steps = int(num_train_steps * training_args.warmup_ratio) - else: - num_warmup_steps = 0 - optimizer, lr_schedule = create_optimizer( - init_lr=training_args.learning_rate, - num_train_steps=num_train_steps, - num_warmup_steps=num_warmup_steps, - adam_beta1=training_args.adam_beta1, - adam_beta2=training_args.adam_beta2, - adam_epsilon=training_args.adam_epsilon, - weight_decay_rate=training_args.weight_decay, - adam_global_clipnorm=training_args.max_grad_norm, - ) - - - dataset_options = tf.data.Options() - dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF - - # model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in - # training. This is the recommended way to use a Hugging Face dataset when training with Keras. 
You can also - # use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names - # yourself if you use this method, whereas they are automatically inferred from the model input names when - # using model.prepare_tf_dataset() - # For more info see the docs: - # https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset - # https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset - - tf_train_dataset = model.prepare_tf_dataset( - train_dataset, - shuffle=True, - batch_size=total_train_batch_size, - collate_fn=data_collator, - ).with_options(dataset_options) - - tf_eval_dataset = model.prepare_tf_dataset( - eval_dataset, - shuffle=False, - batch_size=total_eval_batch_size, - collate_fn=data_collator, - drop_remainder=True, - ).with_options(dataset_options) - - model.compile(optimizer=optimizer, metrics=["accuracy"], jit_compile=training_args.xla) - # endregion - - def compute_metrics(preds, labels): - predictions = preds["logits"] - preds = np.argmax(predictions, axis=1) - return {"accuracy": (preds == labels).astype(np.float32).mean().item()} - - # region tuning - if optim_args.tune: - from intel_extension_for_transformers.transformers import metrics, objectives, QuantizationConfig, TFOptimization - optimization = TFOptimization( - model=model, - args=training_args, - train_dataset=tf_train_dataset, - eval_dataset=tf_eval_dataset, - compute_metrics=compute_metrics, - task_type=strategy.cluster_resolver.task_type if isinstance(strategy, tf.distribute.MultiWorkerMirroredStrategy) else None, - task_id=strategy.cluster_resolver.task_id if isinstance(strategy, tf.distribute.MultiWorkerMirroredStrategy) else None, - ) - - # use customized eval function - tune_metric = metrics.Metric( - name="accuracy", greater_is_better=True, is_relative=True, criterion=optim_args.perf_tol, - ) - quantization_config = QuantizationConfig( - framework="tensorflow", - approach="POSTTRAININGSTATIC", - metrics=[tune_metric], - objectives=[objectives.performance] - ) - quantized_model = optimization.quantize(quant_config=quantization_config) - exit(0) - # endregion - - # region Training - eval_metrics = None - if training_args.do_train: - history = model.fit( - tf_train_dataset, - validation_data=tf_eval_dataset, - epochs=int(training_args.num_train_epochs), - ) - model.save("finetuned_model") - eval_metrics = {key: val[-1] for key, val in history.history.items()} - # endregion - - # region Evaluation - if training_args.do_eval: - num_examples = sum(1 for _ in ( - tf_eval_dataset.unbatch() if hasattr(tf_eval_dataset, "unbatch") else tf_eval_dataset)) - - if optim_args.int8: - model = tf.saved_model.load(training_args.output_dir) - else: - from intel_extension_for_transformers.transformers.utils.utility_tf import keras2SavedModel - model = keras2SavedModel(model) - - logger.info(f"***** Running Evaluation *****") - logger.info(f" Num examples in dataset = {num_examples}") - logger.info(f" Batch size = {training_args.per_device_eval_batch_size}") - - preds: np.ndarray = None - label_ids: np.ndarray = None - infer = model.signatures["serving_default"] - - if optim_args.accuracy_only: - iterations = 1 - warmup = 0 - else: - iterations = 10 - warmup = 5 - latency_list = [] - - for idx in range(iterations): - iteration_time = 0 - for i, (inputs, labels) in enumerate(tf_eval_dataset): - for name in inputs: - inputs[name] = tf.constant(inputs[name].numpy(), 
dtype=infer.inputs[0].dtype) - - start = time.time() - results = infer(**inputs) - iteration_time += time.time() - start - if idx == 0: # only accumulate once all the preds and labels - if preds is None: - preds = results["Identity"].numpy() - else: - preds = np.append(preds, results["Identity"].numpy(), axis=0) - if label_ids is None: - label_ids = labels[0].numpy() if isinstance( - labels, list) else labels.numpy() - else: - label_ids = np.append( - label_ids, - labels[0].numpy() if isinstance(labels, list) else labels.numpy(), - axis=0) - latency_list.append(iteration_time) - logger.info("Iteration {} time: {} sec".format(idx, iteration_time)) - - test_predictions = {"logits": preds} - eval_metrics = compute_metrics(test_predictions, label_ids) - logger.info("\nEvaluation result: ") - logger.info("Accuracy: {}".format(eval_metrics["accuracy"])) - - average_iteration_time = np.array(latency_list[warmup:]).mean() - logger.info( - "Throughput: {} samples/sec".format( - num_examples / average_iteration_time) - ) - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/examples/huggingface/tensorflow/multiple-choice/quantization/run_tuning.sh b/examples/huggingface/tensorflow/multiple-choice/quantization/run_tuning.sh deleted file mode 100644 index 79e6c5b7e87..00000000000 --- a/examples/huggingface/tensorflow/multiple-choice/quantization/run_tuning.sh +++ /dev/null @@ -1,91 +0,0 @@ -#!/bin/bash -set -x - -function main { - - init_params "$@" - run_tuning - -} - -# init params -function init_params { - topology="distilbert" - tuned_checkpoint="saved_results" - extra_cmd="" - batch_size=8 - MAX_SEQ_LENGTH=128 - model_type="bert" - approach="PostTrainingStatic" - cache_dir="cache" - for var in "$@" - do - case $var in - --topology=*) - topology=$(echo $var |cut -f2 -d=) - ;; - --dataset_location=*) - dataset_location=$(echo $var |cut -f2 -d=) - ;; - --input_model=*) - input_model=$(echo $var |cut -f2 -d=) - ;; - --output_model=*) - tuned_checkpoint=$(echo $var |cut -f2 -d=) - ;; - --worker=*) - worker=$(echo $var |cut -f2 -d=) - ;; - --task_index=*) - task_index=$(echo $var |cut -f2 -d=) - ;; - --cache_dir=*) - cache_dir=$(echo $var |cut -f2 -d=) - ;; - *) - echo "Error: No such parameter: ${var}" - exit 1 - ;; - esac - done - -} - -# run_tuning -function run_tuning { - if [ "${topology}" = "distilbert_swag" ]; then - script="run_swag.py" - model_name_or_path="Rocketknight1/bert-base-uncased-finetuned-swag" - approach="PostTrainingStatic" - # add following parameters for quicker debugging - extra_cmd=$extra_cmd" --max_train_samples 512 --max_eval_samples 1024 --perf_tol 0.035" - fi - - if [ "${worker}" = "" ] - then - python -u ${script} \ - --model_name_or_path ${model_name_or_path} \ - --output_dir ${tuned_checkpoint} \ - --quantization_approach ${approach} \ - --do_train \ - --tune \ - --overwrite_output_dir \ - --cache_dir ${cache_dir} \ - ${extra_cmd} - else - python -u ${script} \ - --model_name_or_path ${model_name_or_path} \ - --task_name ${TASK_NAME} \ - --output_dir ${tuned_checkpoint} \ - --quantization_approach ${approach} \ - --do_train \ - --tune \ - --overwrite_output_dir \ - --cache_dir ${cache_dir} \ - --worker "${worker}" \ - --task_index ${task_index} \ - ${extra_cmd} - fi -} - -main "$@" diff --git a/examples/huggingface/tensorflow/text-classification/pruning/README.md b/examples/huggingface/tensorflow/text-classification/pruning/README.md deleted file mode 100644 index b636b9de404..00000000000 --- 
a/examples/huggingface/tensorflow/text-classification/pruning/README.md
+++ /dev/null
@@ -1,86 +0,0 @@
-Step-by-Step
-=========
-
-This document describes the step-by-step instructions for reproducing pruning on models for the text classification (GLUE) tasks.
-
-# Prerequisite
-## 1. Installation
-
-Make sure you have installed Intel® Extension for Transformers and all the dependencies in the current example:
-
-```shell
-pip install intel-extension-for-transformers
-pip install -r requirements.txt
-pip install transformers==4.34.1
-```
->**Note**: Please use a transformers version no higher than 4.34.1.
-
-
-# Run
-
-## 1. Run Command (Shell)
-
-- Topology:
-   - distilbert_base_sst2
-
-```
-bash run_tuning.sh --topology=[topology]
-```
-
-```
-bash run_benchmark.sh --topology=[topology] --mode=benchmark --use_pruned_model=true
-```
-
-## 2. Run Command (Python)
-
-```
-python run_glue.py \
-    --model_name_or_path distilbert-base-uncased-finetuned-sst-2-english \
-    --task_name sst2 \
-    --prune \
-    --do_train \
-    --do_eval \
-    --output_dir ./tmp/sst2_output \
-    --overwrite_output_dir
-```
-
-# Multi-node Usage
-
-We also support Distributed Data Parallel training in multi-node settings for pruning.
-
-The default strategy is TensorFlow's `MultiWorkerMirroredStrategy`; with `task_type` set to "worker", the script expects the following extra parameters:
-
-* `worker`: a comma-separated string of your worker IP addresses, with no spaces between them
-
-* `task_index`: set to 0 on the chief node (leader) and to 1, 2, 3... as the rank of each follower node
-
-## Multi-node Example
-
-* On the leader node
-
-```
-bash run_tuning.sh --topology=distilbert_base_sst2 --worker="localhost:12345,localhost:23456" --task_index=0
-```
-
-which is equivalent to
-
-```
-python run_glue.py \
-    --model_name_or_path distilbert-base-uncased-finetuned-sst-2-english \
-    --task_name sst2 \
-    --prune \
-    --do_train \
-    --do_eval \
-    --output_dir ./tmp/sst2_output \
-    --overwrite_output_dir \
-    --worker "localhost:12345,localhost:23456" \
-    --task_index 0
-```
-
-* On the follower node
-
-```
-bash run_tuning.sh --topology=distilbert_base_sst2 --worker="localhost:12345,localhost:23456" --task_index=1
-```
-
-Please replace the worker IP address list with your own.
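The multi-node bootstrap that the deleted `run_glue.py` performed for this README reduces to a few calls. Below is a minimal sketch, assuming `distributed_init` and `get_filepath` from `intel_extension_for_transformers.transformers.utils.utility_tf` behave as their call sites in `run_glue.py` suggest (registering this node in the cluster and deriving a per-worker output path):

```python
# Minimal sketch of the multi-node setup in the deleted run_glue.py.
# The semantics of distributed_init/get_filepath are assumed from their usage there.
import tensorflow as tf
from intel_extension_for_transformers.transformers.utils.utility_tf import (
    distributed_init,
    get_filepath,
)

worker = "localhost:12345,localhost:23456"  # comma-separated, no spaces
task_index = 0                              # 0 on the chief, 1, 2, ... on followers

worker_list = worker.split(",")
distributed_init(worker_list, "worker", task_index)  # register this node in the cluster

strategy = tf.distribute.MultiWorkerMirroredStrategy()
# Give each worker its own output directory to avoid checkpoint collisions.
output_dir = get_filepath(
    "./tmp/sst2_output",
    strategy.cluster_resolver.task_type,
    strategy.cluster_resolver.task_id,
)
with strategy.scope():
    ...  # build and compile the Keras model here, as run_glue.py did
```

Once the model is built and compiled inside `strategy.scope()`, TensorFlow replicates training across the listed workers.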
diff --git a/examples/huggingface/tensorflow/text-classification/pruning/requirements.txt b/examples/huggingface/tensorflow/text-classification/pruning/requirements.txt deleted file mode 100644 index 245a729ec94..00000000000 --- a/examples/huggingface/tensorflow/text-classification/pruning/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -accelerate -datasets >= 1.17 -sentencepiece != 0.1.92 -protobuf -intel-tensorflow -transformers diff --git a/examples/huggingface/tensorflow/text-classification/pruning/run_benchmark.sh b/examples/huggingface/tensorflow/text-classification/pruning/run_benchmark.sh deleted file mode 100644 index 76c9b07045f..00000000000 --- a/examples/huggingface/tensorflow/text-classification/pruning/run_benchmark.sh +++ /dev/null @@ -1,90 +0,0 @@ -#!/bin/bash -set -x - -function main { - - init_params "$@" - run_benchmark - -} - -# init params -function init_params { - iters=100 - batch_size=64 - tuned_checkpoint=saved_results - topology="distilbert_base_sst2" - mode="benchmark" - for var in "$@" - do - case $var in - --topology=*) - topology=$(echo $var |cut -f2 -d=) - ;; - --dataset_location=*) - dataset_location=$(echo $var |cut -f2 -d=) - ;; - --input_model=*) - input_model=$(echo $var |cut -f2 -d=) - ;; - --mode=*) - mode=$(echo $var |cut -f2 -d=) - ;; - --batch_size=*) - batch_size=$(echo $var |cut -f2 -d=) - ;; - --iters=*) - iters=$(echo ${var} |cut -f2 -d=) - ;; - --use_pruned_model=*) - use_pruned_model=$(echo ${var} |cut -f2 -d=) - ;; - --config=*) - tuned_checkpoint=$(echo $var |cut -f2 -d=) - ;; - *) - echo "Error: No such parameter: ${var}" - exit 1 - ;; - esac - done - -} - - -# run_benchmark -function run_benchmark { - extra_cmd='' - MAX_SEQ_LENGTH=128 - - if [[ ${mode} == "accuracy" ]]; then - mode_cmd=" --accuracy_only" - elif [[ ${mode} == "benchmark" ]]; then - mode_cmd=" --benchmark " - else - echo "Error: No such mode: ${mode}" - exit 1 - fi - - if [ "${topology}" = "distilbert_base_sst2" ]; then - TASK_NAME='sst2' - model_name_or_path=distilbert-base-uncased-finetuned-sst-2-english - fi - - if [[ ${use_pruned_model} == "true" ]]; then - extra_cmd=$extra_cmd" --use_pruned_model" - fi - - python -u ./run_glue.py \ - --model_name_or_path ${model_name_or_path} \ - --task_name ${TASK_NAME} \ - --do_eval \ - --per_device_eval_batch_size ${batch_size} \ - --output_dir ${tuned_checkpoint} \ - --overwrite_cache \ - ${mode_cmd} \ - ${extra_cmd} - -} - -main "$@" diff --git a/examples/huggingface/tensorflow/text-classification/pruning/run_glue.py b/examples/huggingface/tensorflow/text-classification/pruning/run_glue.py deleted file mode 100644 index 67c35ff2471..00000000000 --- a/examples/huggingface/tensorflow/text-classification/pruning/run_glue.py +++ /dev/null @@ -1,689 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2020 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-""" Finetuning the library models for sequence classification on GLUE.""" -# You can also adapt this script on your own text classification task. Pointers for this are left as comments. - -import logging -import os -import sys -import numpy as np -import tensorflow as tf -import time -import transformers -from dataclasses import dataclass, field -from typing import Optional - -from datasets import load_dataset, load_metric - -from transformers import ( - AutoConfig, - AutoTokenizer, - DataCollatorWithPadding, - DefaultDataCollator, - HfArgumentParser, - PretrainedConfig, - TFAutoModelForSequenceClassification, - TFTrainingArguments, - set_seed, -) -from transformers.trainer_utils import get_last_checkpoint, is_main_process -from transformers.utils import check_min_version - - -# region Helper functions - - -class SavePretrainedCallback(tf.keras.callbacks.Callback): - # Hugging Face models have a save_pretrained() method that saves both the weights and the necessary - # metadata to allow them to be loaded as a pretrained model in future. This is a simple Keras callback - # that saves the model with this method after each epoch. - def __init__(self, output_dir, **kwargs): - super().__init__() - self.output_dir = output_dir - - def on_epoch_end(self, epoch, logs=None): - self.model.save_pretrained(self.output_dir) - - -# endregion - - -# Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.17.0") - -task_to_keys = { - "cola": ("sentence", None), - "mnli": ("premise", "hypothesis"), - "mrpc": ("sentence1", "sentence2"), - "qnli": ("question", "sentence"), - "qqp": ("question1", "question2"), - "rte": ("sentence1", "sentence2"), - "sst2": ("sentence", None), - "stsb": ("sentence1", "sentence2"), - "wnli": ("sentence1", "sentence2"), -} - -logger = logging.getLogger(__name__) - - -# region Command-line arguments -@dataclass -class DataTrainingArguments: - """ - Arguments pertaining to what data we are going to input our model for training and eval. - - Using `HfArgumentParser` we can turn this class - into argparse arguments to be able to specify them on - the command line. - """ - - task_name: str = field( - metadata={"help": "The name of the task to train on: " + ", ".join(task_to_keys.keys())}, - ) - predict_file: str = field( - metadata={"help": "A file containing user-supplied examples to make predictions for"}, - default=None, - ) - max_seq_length: int = field( - default=128, - metadata={ - "help": "The maximum total input sequence length after tokenization. Sequences longer " - "than this will be truncated, sequences shorter will be padded." - }, - ) - overwrite_cache: bool = field( - default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."} - ) - pad_to_max_length: bool = field( - default=False, - metadata={ - "help": "Whether to pad all samples to `max_seq_length`. " - "If False, will pad the samples dynamically when batching to the maximum length in the batch." - }, - ) - max_train_samples: Optional[int] = field( - default=None, - metadata={ - "help": "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set." - }, - ) - max_eval_samples: Optional[int] = field( - default=None, - metadata={ - "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " - "value if set." 
-        },
-    )
-    max_predict_samples: Optional[int] = field(
-        default=None,
-        metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this "
-            "value if set."
-        },
-    )
-
-    def __post_init__(self):
-        self.task_name = self.task_name.lower()
-        if self.task_name not in task_to_keys.keys():
-            raise ValueError("Unknown task, you should pick one in " + ",".join(task_to_keys.keys()))
-
-
-@dataclass
-class ModelArguments:
-    """
-    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
-    """
-
-    model_name_or_path: str = field(
-        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
-    )
-    config_name: Optional[str] = field(
-        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
-    )
-    tokenizer_name: Optional[str] = field(
-        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
-    )
-    cache_dir: Optional[str] = field(
-        default=None,
-        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
-    )
-    use_fast_tokenizer: bool = field(
-        default=True,
-        metadata={"help": "Whether to use one of the fast tokenizers (backed by the tokenizers library) or not."},
-    )
-    model_revision: str = field(
-        default="main",
-        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
-    )
-    use_auth_token: bool = field(
-        default=False,
-        metadata={
-            "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
-            "with private models)."
-        },
-    )
-
-
-@dataclass
-class OptimizationArguments:
-    """
-    Arguments pertaining to what type of optimization we are going to apply on the model.
-    """
-
-    prune: bool = field(
-        default=False,
-        metadata={"help": "Whether or not to apply pruning."},
-    )
-    pruning_approach: Optional[str] = field(
-        default="BasicMagnitude",
-        metadata={"help": "Pruning approach. Supported approach is basic_magnitude."},
-    )
-    target_sparsity_ratio: Optional[float] = field(
-        default=None,
-        metadata={"help": "Targeted sparsity when pruning the model."},
-    )
-    metric_name: Optional[str] = field(
-        default=None,
-        metadata={"help": "Metric used for the tuning strategy."},
-    )
-    tolerance_mode: Optional[str] = field(
-        default="relative",
-        metadata={"help": "Metric tolerance mode, expected to be relative or absolute."},
-    )
-    perf_tol: Optional[float] = field(
-        default=0.01,
-        metadata={"help": "Performance tolerance when optimizing the model."},
-    )
-    benchmark: bool = field(
-        default=False,
-        metadata={"help": "Run benchmark."})
-    use_pruned_model: bool = field(
-        default=False,
-        metadata={"help": "Whether to use the pretrained pruned model."})
-    accuracy_only: bool = field(
-        default=False,
-        metadata={"help": "Whether to only test accuracy for the model tuned by Neural Compressor."})
-
-@dataclass
-class DistributedArguments:
-    """
-    Arguments setting up the distributed multi-node environment.
-    """
-
-    worker: str = field(
-        default=None,
-        metadata={"help": "List of node IP addresses in a string; there should be no spaces between addresses."},
-    )
-    task_index: int = field(
-        default=0,
-        metadata={"help": "Worker index; 0 represents the chief worker, while other workers are set as 1, 2, 3..."},
-    )
-# endregion
-
-def main():
-    # region Argument parsing
-    # See all possible arguments in src/transformers/training_args.py
-    # or by passing the --help flag to this script.
- # We now keep distinct sets of args, for a cleaner separation of concerns. - - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments, OptimizationArguments, DistributedArguments)) - if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): - # If we pass only one argument to the script and it's the path to a json file, - # let's parse it to get our arguments. - model_args, data_args, training_args, optim_args, distributed_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) - else: - model_args, data_args, training_args, optim_args, distributed_args = parser.parse_args_into_dataclasses() - - if not (training_args.do_train or training_args.do_eval or training_args.do_predict): - exit("Must specify at least one of --do_train, --do_eval or --do_predict!") - # endregion - - # region Set the multinode environment, the strategy and paths - strategy = None - worker_list = None - if distributed_args.worker is not None: - logger.info("distributed environment initialization...") - - worker_list = distributed_args.worker.split(",") - - from intel_extension_for_transformers.transformers.utils.utility_tf import distributed_init - distributed_init(worker_list, "worker", distributed_args.task_index) - - strategy = tf.distribute.MultiWorkerMirroredStrategy() - from intel_extension_for_transformers.transformers.utils.utility_tf import get_filepath - training_args.output_dir = get_filepath(training_args.output_dir, strategy.cluster_resolver.task_type, strategy.cluster_resolver.task_id) - else: - strategy = training_args.strategy - #endregion - - # region Checkpoints - checkpoint = None - if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: - checkpoint = get_last_checkpoint(training_args.output_dir) - if checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: - raise ValueError( - f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome." - ) - elif checkpoint is not None and training_args.resume_from_checkpoint is None: - logger.info( - f"Checkpoint detected, resuming training at {checkpoint}. To avoid this behavior, change " - "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." - ) - # endregion - - # region Logging - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - handlers=[logging.StreamHandler(sys.stdout)], - ) - logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN) - - # Set the verbosity to info of the Transformers logger (on main process only): - if is_main_process(training_args.local_rank): - transformers.utils.logging.set_verbosity_info() - transformers.utils.logging.enable_default_handler() - transformers.utils.logging.enable_explicit_format() - logger.info(f"Training/evaluation parameters {training_args}") - # endregion - - # region Dataset and labels - # Set seed before initializing model. - set_seed(training_args.seed) - - # Downloading and loading a dataset from the hub. In distributed training, the load_dataset function guarantee - # that only one local process can concurrently download the dataset. 
- datasets = load_dataset( - "glue", - data_args.task_name, - cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, - ) - # See more about loading any type of standard or custom dataset at - # https://huggingface.co/docs/datasets/loading_datasets.html. - - is_regression = data_args.task_name == "stsb" - if not is_regression: - label_list = datasets["train"].features["label"].names - num_labels = len(label_list) - else: - num_labels = 1 - - if data_args.predict_file is not None: - logger.info("Preparing user-supplied file for predictions...") - - data_files = {"data": data_args.predict_file} - - for key in data_files.keys(): - logger.info(f"Loading a local file for {key}: {data_files[key]}") - - if data_args.predict_file.endswith(".csv"): - # Loading a dataset from local csv files - user_dataset = load_dataset("csv", data_files=data_files, cache_dir=model_args.cache_dir) - else: - # Loading a dataset from local json files - user_dataset = load_dataset("json", data_files=data_files, cache_dir=model_args.cache_dir) - needed_keys = task_to_keys[data_args.task_name] - for key in needed_keys: - assert key in user_dataset["data"].features, f"Your supplied predict_file is missing the {key} key!" - datasets["user_data"] = user_dataset["data"] - # endregion - - # region Load model config and tokenizer - # - # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently - # download model & vocab. - config = AutoConfig.from_pretrained( - model_args.config_name if model_args.config_name else model_args.model_name_or_path, - num_labels=num_labels, - finetuning_task=data_args.task_name, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - ) - tokenizer = AutoTokenizer.from_pretrained( - model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, - cache_dir=model_args.cache_dir, - use_fast=model_args.use_fast_tokenizer, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - ) - # endregion - - # region Dataset preprocessing - sentence1_key, sentence2_key = task_to_keys[data_args.task_name] - non_label_column_names = [name for name in datasets["train"].column_names if name != "label"] - - # Padding strategy - if data_args.pad_to_max_length: - padding = "max_length" - else: - # We will pad later, dynamically at batch creation, to the max sequence length in each batch - padding = False - - # Some models have set the order of the labels to use, so let's make sure we do use it. - label_to_id = None - if config.label2id != PretrainedConfig(num_labels=num_labels).label2id and not is_regression: - # Some have all caps in their config, some don't. - label_name_to_id = {k.lower(): v for k, v in config.label2id.items()} - if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)): - label_to_id = {i: int(label_name_to_id[label_list[i]]) for i in range(num_labels)} - else: - logger.warning( - "Your model seems to have been trained with labels, but they don't match the dataset: ", - f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}." 
- "\nIgnoring the model labels as a result.", - ) - label_to_id = {label: i for i, label in enumerate(label_list)} - if label_to_id is not None: - config.label2id = label_to_id - config.id2label = {id: label for label, id in config.label2id.items()} - elif data_args.task_name is not None and not is_regression: - config.label2id = {l: i for i, l in enumerate(label_list)} - config.id2label = {id: label for label, id in config.label2id.items()} - - if data_args.max_seq_length > tokenizer.model_max_length: - logger.warning( - f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" - f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." - ) - max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) - - def preprocess_function(examples): - # Tokenize the texts - args = ( - (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key]) - ) - result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True) - - return result - - datasets = datasets.map(preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache) - - if data_args.pad_to_max_length: - data_collator = DefaultDataCollator(return_tensors="tf") - else: - data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf") - # endregion - - # region Metric function - metric = load_metric("glue", data_args.task_name) - - def compute_metrics(preds, label_ids): - preds = preds["logits"] - preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1) - result = metric.compute(predictions=preds, references=label_ids) - if len(result) > 1: - result["combined_score"] = np.mean(list(result.values())).item() - return result - # endregion - - if distributed_args.worker is None: - strategy = training_args.strategy - - with strategy.scope(): - # region Load pretrained model - if checkpoint is None: - model_path = model_args.model_name_or_path - else: - model_path = checkpoint - model = TFAutoModelForSequenceClassification.from_pretrained( - model_path, - config=config, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - ) - # endregion - - # region Optimizer, loss and compilation - optimizer = tf.keras.optimizers.Adam( - learning_rate=training_args.learning_rate, - beta_1=training_args.adam_beta1, - beta_2=training_args.adam_beta2, - epsilon=training_args.adam_epsilon, - clipnorm=training_args.max_grad_norm, - ) - if is_regression: - loss_fn = tf.keras.losses.MeanSquaredError() - metrics = [] - else: - loss_fn = tf.keras.losses.SparseCategoricalCrossentropy( - from_logits=True, reduction=tf.keras.losses.Reduction.SUM - ) - metrics = ["accuracy"] - model.compile(optimizer=optimizer, loss=loss_fn, metrics=metrics) - # endregion - - # region Convert data to a tf.data.Dataset - tf_data = dict() - max_samples = { - "train": data_args.max_train_samples, - "validation": data_args.max_eval_samples, - "validation_matched": data_args.max_eval_samples, - "validation_mismatched": data_args.max_eval_samples, - "test": data_args.max_predict_samples, - "test_matched": data_args.max_predict_samples, - "test_mismatched": data_args.max_predict_samples, - "user_data": None, - } - - for key in datasets.keys(): - if key == "train" or key.startswith("validation"): - assert "label" in datasets[key].features, f"Missing labels from {key} data!" 
- if key == "train": - shuffle = True - batch_size = training_args.per_device_train_batch_size * (len(worker_list) if worker_list is not None else 1) - drop_remainder = True # Saves us worrying about scaling gradients for the last batch - else: - shuffle = False - batch_size = training_args.per_device_eval_batch_size * (len(worker_list) if worker_list is not None else 1) - drop_remainder = False - samples_limit = max_samples[key] - dataset = datasets[key] - if samples_limit is not None: - dataset = dataset.select(range(samples_limit)) - data = dataset.to_tf_dataset( - columns=[col for col in dataset.column_names if col not in set(non_label_column_names + ["label"])], - shuffle=shuffle, - batch_size=batch_size, - collate_fn=data_collator, - drop_remainder=drop_remainder, - # `label_cols` is needed for user-defined losses, such as in this example - # datasets v2.3.x need "labels", not "label" - label_cols=["labels"] if "label" in dataset.column_names else None, - ) - tf_data[key] = data - # endregion - - # region Pruning - if optim_args.prune: - from intel_extension_for_transformers.transformers import metrics, PrunerConfig, PruningConfig, TFOptimization - optimization = TFOptimization( - model=model, - args=training_args, - train_dataset=tf_data["train"], - eval_dataset=tf_data["validation"], - compute_metrics=compute_metrics, - criterion=loss_fn, - optimizer=optimizer, - task_type=strategy.cluster_resolver.task_type if isinstance(strategy, tf.distribute.MultiWorkerMirroredStrategy) else None, - task_id=strategy.cluster_resolver.task_id if isinstance(strategy, tf.distribute.MultiWorkerMirroredStrategy) else None, - ) - tune_metric = metrics.Metric( - name="accuracy", greater_is_better=True, is_relative=True, criterion=0.01, - ) - prune_type = 'BasicMagnitude' \ - if optim_args.pruning_approach else optim_args.pruning_approach - target_sparsity_ratio = None \ - if optim_args.target_sparsity_ratio is None else optim_args.target_sparsity_ratio - pruner_config = PrunerConfig(prune_type=prune_type, target_sparsity_ratio=target_sparsity_ratio) - pruning_conf = PruningConfig( - epochs=int(training_args.num_train_epochs), pruner_config=pruner_config, metrics=tune_metric, - framework="tensorflow" - ) - p_model = optimization.prune(pruning_config=pruning_conf) - return - # endregion - - # region Training and validation - if training_args.do_train: - callbacks = [SavePretrainedCallback(output_dir=training_args.output_dir)] - if training_args.do_eval and not data_args.task_name == "mnli": - # Do both evaluation and training in the Keras fit loop, unless the task is MNLI - # because MNLI has two validation sets - validation_data = tf_data["validation"] - else: - validation_data = None - model.fit( - tf_data["train"], - validation_data=validation_data, - epochs=int(training_args.num_train_epochs), - callbacks=callbacks, - ) - # endregion - - # region Evaluation - if training_args.do_eval: - # We normally do validation as part of the Keras fit loop, but we run it independently - # if there was no fit() step (because we didn't train the model) or if the task is MNLI, - # because MNLI has a separate validation-mismatched validation set - logger.info("*** Evaluate ***") - - # Loop to handle MNLI double evaluation (matched, mis-matched) - if data_args.task_name == "mnli": - tasks = ["mnli", "mnli-mm"] - tf_datasets = [tf_data["validation_matched"], tf_data["validation_mismatched"]] - raw_datasets = [datasets["validation_matched"], datasets["validation_mismatched"]] - else: - tasks = [data_args.task_name] - 
tf_datasets = [tf_data["validation"]] - raw_datasets = [datasets["validation"]] - - total_time = 0 - num_examples = 0 - if optim_args.use_pruned_model: - model = tf.saved_model.load(training_args.output_dir) - for raw_dataset, tf_dataset, task in zip(raw_datasets, tf_datasets, - tasks): - num_examples += sum( - 1 for _ in (tf_dataset.unbatch() - if hasattr(tf_dataset, "unbatch") else tf_dataset - ) - ) - if optim_args.use_pruned_model: - preds: np.ndarray = None - label_ids: np.ndarray = None - infer = model.signatures[list(model.signatures.keys())[0]] - for i, (inputs, labels) in enumerate(tf_dataset): - for name in inputs: - inputs[name] = tf.constant(inputs[name].numpy(), dtype=infer.inputs[0].dtype) - start = time.time() - results = infer(**inputs) - total_time += time.time() - start - for val in results: - if preds is None: - preds = results[val].numpy() - else: - preds = np.append(preds, results[val].numpy(), axis=0) - if label_ids is None: - label_ids = labels.numpy() - else: - label_ids = np.append(label_ids, labels.numpy(), axis=0) - eval_metrics = compute_metrics({"logits": preds}, label_ids) - else: - start = time.time() - eval_predictions = model.predict(tf_dataset) - total_time += time.time() - start - eval_metrics = compute_metrics(eval_predictions, raw_dataset["label"]) - print(f"Evaluation metrics ({task}):") - print(eval_metrics) - - logger.info("metric ({}) Accuracy: {}".format(task, eval_metrics["accuracy"])) - logger.info( - "Throughput: {} samples/sec".format( - num_examples / total_time) - ) - # endregion - - # region Prediction - if training_args.do_predict or data_args.predict_file: - logger.info("*** Predict ***") - - # Loop to handle MNLI double evaluation (matched, mis-matched) - tasks = [] - tf_datasets = [] - raw_datasets = [] - if training_args.do_predict: - if data_args.task_name == "mnli": - tasks.extend(["mnli", "mnli-mm"]) - tf_datasets.extend([tf_data["test_matched"], tf_data["test_mismatched"]]) - raw_datasets.extend([datasets["test_matched"], datasets["test_mismatched"]]) - else: - tasks.append(data_args.task_name) - tf_datasets.append(tf_data["test"]) - raw_datasets.append(datasets["test"]) - if data_args.predict_file: - tasks.append("user_data") - tf_datasets.append(tf_data["user_data"]) - raw_datasets.append(datasets["user_data"]) - - if optim_args.use_pruned_model: - model = tf.saved_model.load(training_args.output_dir) - - for raw_dataset, tf_dataset, task in zip(raw_datasets, tf_datasets, tasks): - if optim_args.use_pruned_model: - preds: np.ndarray = None - infer = model.signatures[list(model.signatures.keys())[0]] - for i, (inputs, labels) in enumerate(tf_dataset): - for name in inputs: - inputs[name] = tf.constant(inputs[name].numpy(), dtype=infer.inputs[0].dtype) - results = infer(**inputs) - for val in results: - if preds is None: - preds = results[val].numpy() - else: - preds = np.append(preds, results[val].numpy(), axis=0) - test_predictions = {"logits": preds} - else: - test_predictions = model.predict(tf_dataset) - if "label" in raw_dataset: - test_metrics = compute_metrics(test_predictions, raw_dataset["label"]) - print(f"Test metrics ({task}):") - print(test_metrics) - - if is_regression: - predictions_to_write = np.squeeze(test_predictions["logits"]) - else: - predictions_to_write = np.argmax(test_predictions["logits"], axis=1) - - output_predict_file = os.path.join(training_args.output_dir, f"predict_results_{task}.txt") - with open(output_predict_file, "w") as writer: - logger.info(f"***** Writing prediction results for {task} 
*****") - writer.write("index\tprediction\n") - for index, item in enumerate(predictions_to_write): - if is_regression: - writer.write(f"{index}\t{item:3.3f}\n") - else: - item = config.id2label[item] - writer.write(f"{index}\t{item}\n") - # endregion - - -if __name__ == "__main__": - main() diff --git a/examples/huggingface/tensorflow/text-classification/pruning/run_tuning.sh b/examples/huggingface/tensorflow/text-classification/pruning/run_tuning.sh deleted file mode 100644 index 3fca9c69a4f..00000000000 --- a/examples/huggingface/tensorflow/text-classification/pruning/run_tuning.sh +++ /dev/null @@ -1,88 +0,0 @@ -#!/bin/bash -set -x - -function main { - - init_params "$@" - run_tuning - -} - -# init params -function init_params { - tuned_checkpoint=saved_results - topology="distilbert_base_sst2" - # topology="bert_base_mrpc_static" - for var in "$@" - do - case $var in - --topology=*) - topology=$(echo $var |cut -f2 -d=) - ;; - --dataset_location=*) - dataset_location=$(echo $var |cut -f2 -d=) - ;; - --input_model=*) - input_model=$(echo $var |cut -f2 -d=) - ;; - --output_model=*) - tuned_checkpoint=$(echo $var |cut -f2 -d=) - ;; - --worker=*) - worker=$(echo $var |cut -f2 -d=) - ;; - --task_index=*) - task_index=$(echo $var |cut -f2 -d=) - ;; - *) - echo "Error: No such parameter: ${var}" - exit 1 - ;; - esac - done - -} - -# run_tuning -function run_tuning { - extra_cmd='' - batch_size=64 - if [ "${topology}" = "distilbert_base_sst2" ]; then - TASK_NAME='sst2' - model_name_or_path=distilbert-base-uncased-finetuned-sst-2-english - fi - - if [ "${worker}" = "" ] - then - python -u ./run_glue.py \ - --model_name_or_path ${model_name_or_path} \ - --task_name ${TASK_NAME} \ - --target_sparsity_ratio 0.1 \ - --prune \ - --do_eval \ - --do_train \ - --per_device_train_batch_size ${batch_size} \ - --per_device_eval_batch_size ${batch_size} \ - --output_dir ${tuned_checkpoint} \ - --overwrite_output_dir \ - --overwrite_cache - else - python -u ./run_glue.py \ - --model_name_or_path ${model_name_or_path} \ - --task_name ${TASK_NAME} \ - --target_sparsity_ratio 0.1 \ - --prune \ - --do_eval \ - --do_train \ - --per_device_train_batch_size ${batch_size} \ - --per_device_eval_batch_size ${batch_size} \ - --output_dir ${tuned_checkpoint} \ - --overwrite_output_dir \ - --overwrite_cache \ - --worker "${worker}" \ - --task_index ${task_index} \ - ${extra_cmd} - fi -} - -main "$@" diff --git a/examples/huggingface/tensorflow/text-classification/quantization/README.md b/examples/huggingface/tensorflow/text-classification/quantization/README.md deleted file mode 100644 index b590228ad0d..00000000000 --- a/examples/huggingface/tensorflow/text-classification/quantization/README.md +++ /dev/null @@ -1,132 +0,0 @@ -Step-by-Step -========= - -This document describes the step-by-step instructions for reproducing the quantization on models for the text classification (GLUE) tasks. - -GLUE is made up of a total of 9 different tasks. Here is how to run the script on one of them: - -# Prerequisite -## 1. Installation - -Make sure you have installed IntelĀ® Extension for Transformers and all the dependencies in the current example: - -```shell -pip install intel-extension-for-transformers -cd ptq -pip install -r requirements.txt -``` - -# Run - -Here are two options: running with the shell script or running with the python script. Basically, they are equivalent and the shell script just wraps the invocation of the python script and is more concise and easy for users to get started. - -## 1. 
Run Command (Shell)
-
-- Topology:
-   - bert_base_mrpc_static
-   - xlnet_mrpc
-   - albert_large_mrpc
-   - legalbert_mrpc
-
-- To get the int8 model
-
-  ```
-  cd ptq
-  bash run_tuning.sh --topology=[topology] --output_model=./saved_int8
-  ```
-
-- To benchmark the int8 model
-
-  ```
-  cd ptq
-  bash run_benchmark.sh --topology=[topology] --config=./saved_int8 --mode=benchmark --int8=true
-  ```
-
-## 2. Run Command (Python)
-
-- model_name_or_path:
-   - bert-base-cased-finetuned-mrpc
-   - xlnet-base-cased
-   - albert-large-v2
-   - nlpaueb/legal-bert-small-uncased
-
-- To get the int8 model
-
-```
-python run_glue.py \
-    --model_name_or_path [model_name_or_path] \
-    --task_name mrpc \
-    --tune \
-    --quantization_approach static \
-    --do_train \
-    --do_eval \
-    --output_dir ./saved_result \
-    --overwrite_output_dir
-```
-
-- To reload the int8 model
-
-```
-python run_glue.py \
-    --model_name_or_path [model_name_or_path] \
-    --task_name mrpc \
-    --benchmark \
-    --int8 \
-    --do_eval \
-    --output_dir ./saved_result \
-    --overwrite_output_dir
-```
-
-> **Notes**:
- - quantization_approach in TensorFlow consists of `static` and `qat`.
- - task_name consists of cola, sst2, mrpc, stsb, qqp, mnli, qnli, rte, wnli.
-
-
-# Multi-node Usage
-
-We also support Distributed Data Parallel training in multi-node settings for quantization.
-
-> **Note**: multi-node settings boost performance in the training process but may not show good performance with the static quantization strategy.
-
-The default strategy is TensorFlow's `MultiWorkerMirroredStrategy`; with `task_type` set to "worker", the script expects the following extra parameters:
-
-* `worker`: a comma-separated string of your worker IP addresses, with no spaces between them
-
-* `task_index`: set to 0 on the chief node (leader) and to 1, 2, 3... as the rank of each follower node
-
-## Multi-node Example
-
-### 1. Get Int8 Model
-
-* On the leader node
-
-```
-bash run_tuning.sh --topology=bert_base_mrpc_static --output_model=./saved_int8 --worker="localhost:12345,localhost:23456" --task_index=0
-```
-
-* On the follower node
-
-```
-bash run_tuning.sh --topology=bert_base_mrpc_static --output_model=./saved_int8 --worker="localhost:12345,localhost:23456" --task_index=1
-```
-
-Please replace the worker IP address list with your own.
-
-### 2. Reload Int8 Model
-
-* On the leader node
-
-```
-bash run_benchmark.sh --topology=bert_base_mrpc_static --config=./saved_int8 --mode=benchmark --int8=true --worker="localhost:12345,localhost:23456" --task_index=0
-```
-
-* On the follower node
-
-```
-bash run_benchmark.sh --topology=bert_base_mrpc_static --config=./saved_int8 --mode=benchmark --int8=true --worker="localhost:12345,localhost:23456" --task_index=1
-```
-
-Please replace the worker IP address list with your own.
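Reloading the tuned int8 SavedModel that these commands produce follows the same pattern as the evaluation loops in the deleted scripts. Below is a minimal sketch, assuming the SavedModel exposes the default `serving_default` signature and an `Identity` logits output, as the usage in the deleted code suggests:

```python
# Minimal sketch of int8 SavedModel reload and batched inference,
# following the evaluation loops in the deleted run_glue.py.
import tensorflow as tf

model = tf.saved_model.load("./saved_int8")   # directory written by run_tuning.sh
infer = model.signatures["serving_default"]   # default serving signature

def predict_batch(inputs):
    """inputs: one tokenized batch (a dict of tensors) from the eval tf.data.Dataset."""
    cast = {
        name: tf.constant(tensor.numpy(), dtype=infer.inputs[0].dtype)
        for name, tensor in inputs.items()
    }
    return infer(**cast)["Identity"].numpy()  # logits, as in the eval loops
```

Accuracy can then be computed by accumulating these logits and the labels across batches, exactly as `eval_func_mrpc` in the deleted quantization `run_glue.py` below does.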
- - - - diff --git a/examples/huggingface/tensorflow/text-classification/quantization/ptq/requirements.txt b/examples/huggingface/tensorflow/text-classification/quantization/ptq/requirements.txt deleted file mode 100644 index 8067cf9633a..00000000000 --- a/examples/huggingface/tensorflow/text-classification/quantization/ptq/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -datasets >= 1.17 -sentencepiece != 0.1.92 -protobuf -intel-tensorflow -transformers -evaluate -accelerate \ No newline at end of file diff --git a/examples/huggingface/tensorflow/text-classification/quantization/ptq/run_benchmark.sh b/examples/huggingface/tensorflow/text-classification/quantization/ptq/run_benchmark.sh deleted file mode 100644 index 403a0e41b52..00000000000 --- a/examples/huggingface/tensorflow/text-classification/quantization/ptq/run_benchmark.sh +++ /dev/null @@ -1,131 +0,0 @@ -#!/bin/bash -set -x - -function main { - - init_params "$@" - run_benchmark - -} - -# init params -function init_params { - topology="bert_base_mrpc_static" - iters=100 - batch_size=1 - tuned_checkpoint=saved_results - cache_dir="cache" - for var in "$@" - do - case $var in - --topology=*) - topology=$(echo $var |cut -f2 -d=) - ;; - --dataset_location=*) - dataset_location=$(echo $var |cut -f2 -d=) - ;; - --input_model=*) - input_model=$(echo $var |cut -f2 -d=) - ;; - --mode=*) - mode=$(echo $var |cut -f2 -d=) - ;; - --batch_size=*) - batch_size=$(echo $var |cut -f2 -d=) - ;; - --iters=*) - iters=$(echo ${var} |cut -f2 -d=) - ;; - --int8=*) - int8=$(echo ${var} |cut -f2 -d=) - ;; - --config=*) - tuned_checkpoint=$(echo $var |cut -f2 -d=) - ;; - --worker=*) - worker=$(echo $var |cut -f2 -d=) - ;; - --task_index=*) - task_index=$(echo $var |cut -f2 -d=) - ;; - --cache_dir=*) - cache_dir=$(echo $var |cut -f2 -d=) - ;; - *) - echo "Error: No such parameter: ${var}" - exit 1 - ;; - esac - done - -} - - -# run_benchmark -function run_benchmark { - extra_cmd='' - MAX_SEQ_LENGTH=128 - - if [[ ${mode} == "accuracy" ]]; then - mode_cmd=" --accuracy_only" - elif [[ ${mode} == "benchmark" ]]; then - mode_cmd=" --benchmark " - else - echo "Error: No such mode: ${mode}" - exit 1 - fi - - if [ "${topology}" = "bert_base_mrpc_static" ]; then - TASK_NAME="mrpc" - model_name_or_path="bert-base-cased-finetuned-mrpc" - elif [ "${topology}" = "legalbert_mrpc" ]; then - TASK_NAME="mrpc" - model_name_or_path="nlpaueb/legal-bert-small-uncased" - elif [ "${topology}" = "xlnet_mrpc" ]; then - TASK_NAME="mrpc" - model_name_or_path="xlnet-base-cased" - elif [ "${topology}" = "albert_large_mrpc" ]; then - TASK_NAME="mrpc" - model_name_or_path="albert-large-v2" - # add following parameters for quicker debugging - extra_cmd=$extra_cmd" --max_eval_samples 48" - fi - - if [[ ${int8} == "true" ]]; then - extra_cmd=$extra_cmd" --int8" - fi - echo $extra_cmd - - if [ "${worker}" = "" ] - then - python -u ../run_glue.py \ - --model_name_or_path ${model_name_or_path} \ - --task_name ${TASK_NAME} \ - --do_eval \ - --max_seq_length ${MAX_SEQ_LENGTH} \ - --per_device_eval_batch_size ${batch_size} \ - --output_dir ${tuned_checkpoint} \ - --overwrite_output_dir \ - --cache_dir ${cache_dir} \ - --no_cuda \ - ${mode_cmd} \ - ${extra_cmd} - else - python -u ../run_glue.py \ - --model_name_or_path ${model_name_or_path} \ - --task_name ${TASK_NAME} \ - --do_eval \ - --max_seq_length ${MAX_SEQ_LENGTH} \ - --per_device_eval_batch_size ${batch_size} \ - --output_dir ${tuned_checkpoint} \ - --overwrite_output_dir \ - --cache_dir ${cache_dir} \ - --no_cuda \ - --worker "${worker}" \ 
- --task_index ${task_index} \ - ${mode_cmd} \ - ${extra_cmd} - fi -} - -main "$@" diff --git a/examples/huggingface/tensorflow/text-classification/quantization/ptq/run_tuning.sh b/examples/huggingface/tensorflow/text-classification/quantization/ptq/run_tuning.sh deleted file mode 100644 index c84c8654f62..00000000000 --- a/examples/huggingface/tensorflow/text-classification/quantization/ptq/run_tuning.sh +++ /dev/null @@ -1,115 +0,0 @@ -#!/bin/bash -set -x - -function main { - - init_params "$@" - run_tuning - -} - -# init params -function init_params { - topology="bert_base_mrpc_static" - tuned_checkpoint="saved_results" - extra_cmd="" - batch_size=8 - MAX_SEQ_LENGTH=128 - model_type="bert" - approach="PostTrainingStatic" - cache_dir="cache" - for var in "$@" - do - case $var in - --topology=*) - topology=$(echo $var |cut -f2 -d=) - ;; - --dataset_location=*) - dataset_location=$(echo $var |cut -f2 -d=) - ;; - --input_model=*) - input_model=$(echo $var |cut -f2 -d=) - ;; - --output_model=*) - tuned_checkpoint=$(echo $var |cut -f2 -d=) - ;; - --worker=*) - worker=$(echo $var |cut -f2 -d=) - ;; - --task_index=*) - task_index=$(echo $var |cut -f2 -d=) - ;; - --cache_dir=*) - cache_dir=$(echo $var |cut -f2 -d=) - ;; - *) - echo "Error: No such parameter: ${var}" - exit 1 - ;; - esac - done - -} - -# run_tuning -function run_tuning { - batch_size=64 - if [ "${topology}" = "bert_base_mrpc_static" ]; then - TASK_NAME="mrpc" - model_name_or_path="bert-base-cased-finetuned-mrpc" - approach="PostTrainingStatic" - elif [ "${topology}" = "legalbert_mrpc" ]; then - TASK_NAME="mrpc" - model_name_or_path="nlpaueb/legal-bert-small-uncased" - approach="PostTrainingStatic" - extra_cmd=$extra_cmd" --perf_tol 0.1" - elif [ "${topology}" = "xlnet_mrpc" ]; then - TASK_NAME="mrpc" - model_name_or_path="xlnet-base-cased" - approach="PostTrainingStatic" - elif [ "${topology}" = "albert_large_mrpc" ]; then - TASK_NAME="mrpc" - model_name_or_path="albert-large-v2" - approach="PostTrainingStatic" - extra_cmd=$extra_cmd" --perf_tol 0.05" - fi - - if [ "${worker}" = "" ] - then - python -u ../run_glue.py \ - --model_name_or_path ${model_name_or_path} \ - --task_name ${TASK_NAME} \ - --do_eval \ - --max_seq_length ${MAX_SEQ_LENGTH} \ - --per_device_train_batch_size ${batch_size} \ - --per_device_eval_batch_size ${batch_size} \ - --output_dir ${tuned_checkpoint} \ - --no_cuda \ - --overwrite_output_dir \ - --cache_dir ${cache_dir} \ - --quantization_approach ${approach} \ - --do_train \ - --tune \ - ${extra_cmd} - else - python -u ../run_glue.py \ - --model_name_or_path ${model_name_or_path} \ - --task_name ${TASK_NAME} \ - --do_eval \ - --max_seq_length ${MAX_SEQ_LENGTH} \ - --per_device_train_batch_size ${batch_size} \ - --per_device_eval_batch_size ${batch_size} \ - --output_dir ${tuned_checkpoint} \ - --no_cuda \ - --overwrite_output_dir \ - --cache_dir ${cache_dir} \ - --quantization_approach ${approach} \ - --do_train \ - --tune \ - --worker "${worker}" \ - --task_index ${task_index} \ - ${extra_cmd} - fi -} - -main "$@" diff --git a/examples/huggingface/tensorflow/text-classification/quantization/run_glue.py b/examples/huggingface/tensorflow/text-classification/quantization/run_glue.py deleted file mode 100644 index b025ba2cc90..00000000000 --- a/examples/huggingface/tensorflow/text-classification/quantization/run_glue.py +++ /dev/null @@ -1,731 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2020 The HuggingFace Inc. team. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Finetuning the library models for sequence classification on GLUE.""" -# You can also adapt this script on your own text classification task. Pointers for this are left as comments. - -import logging -import os -import sys -import time -from dataclasses import dataclass, field -from typing import Optional - -import numpy as np -import tensorflow as tf -from datasets import load_dataset - -import transformers -from transformers import ( - AutoConfig, - AutoTokenizer, - DataCollatorWithPadding, - DefaultDataCollator, - HfArgumentParser, - PretrainedConfig, - TFAutoModelForSequenceClassification, - TFTrainingArguments, - set_seed, -) -from transformers.trainer_utils import get_last_checkpoint, is_main_process -from transformers.utils import check_min_version - - -# region Helper functions - - -class SavePretrainedCallback(tf.keras.callbacks.Callback): - # Hugging Face models have a save_pretrained() method that saves both the weights and the necessary - # metadata to allow them to be loaded as a pretrained model in future. This is a simple Keras callback - # that saves the model with this method after each epoch. - def __init__(self, output_dir, **kwargs): - super().__init__() - self.output_dir = output_dir - - def on_epoch_end(self, epoch, logs=None): - self.model.save_pretrained(self.output_dir) - - -# endregion - - -# Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.17.0") - -task_to_keys = { - "cola": ("sentence", None), - "mnli": ("premise", "hypothesis"), - "mrpc": ("sentence1", "sentence2"), - "qnli": ("question", "sentence"), - "qqp": ("question1", "question2"), - "rte": ("sentence1", "sentence2"), - "sst2": ("sentence", None), - "stsb": ("sentence1", "sentence2"), - "wnli": ("sentence1", "sentence2"), -} - -logger = logging.getLogger(__name__) - - -# region Command-line arguments -@dataclass -class DataTrainingArguments: - """ - Arguments pertaining to what data we are going to input our model for training and eval. - - Using `HfArgumentParser` we can turn this class - into argparse arguments to be able to specify them on - the command line. - """ - - task_name: str = field( - metadata={"help": "The name of the task to train on: " + ", ".join(task_to_keys.keys())}, - ) - predict_file: str = field( - metadata={"help": "A file containing user-supplied examples to make predictions for"}, - default=None, - ) - max_seq_length: int = field( - default=128, - metadata={ - "help": "The maximum total input sequence length after tokenization. Sequences longer " - "than this will be truncated, sequences shorter will be padded." - }, - ) - overwrite_cache: bool = field( - default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."} - ) - pad_to_max_length: bool = field( - default=False, - metadata={ - "help": "Whether to pad all samples to `max_seq_length`. " - "If False, will pad the samples dynamically when batching to the maximum length in the batch." 
- }, - ) - max_train_samples: Optional[int] = field( - default=None, - metadata={ - "help": "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set." - }, - ) - max_eval_samples: Optional[int] = field( - default=None, - metadata={ - "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " - "value if set." - }, - ) - max_predict_samples: Optional[int] = field( - default=None, - metadata={ - "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this " - "value if set." - }, - ) - - def __post_init__(self): - self.task_name = self.task_name.lower() - if self.task_name not in task_to_keys.keys(): - raise ValueError("Unknown task, you should pick one in " + ",".join(task_to_keys.keys())) - - -@dataclass -class ModelArguments: - """ - Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. - """ - - model_name_or_path: str = field( - metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} - ) - config_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} - ) - tokenizer_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} - ) - cache_dir: Optional[str] = field( - default=None, - metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, - ) - use_fast_tokenizer: bool = field( - default=True, - metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, - ) - model_revision: str = field( - default="main", - metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, - ) - use_auth_token: bool = field( - default=False, - metadata={ - "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " - "with private models)." - }, - ) - - -@dataclass -class OptimizationArguments: - """ - Arguments pertaining to what type of optimization we are going to apply on the model. - """ - - tune: bool = field( - default=False, - metadata={"help": "Whether or not to apply quantization."}, - ) - quantization_approach: Optional[str] = field( - default="static", - metadata={"help": "Quantization approach. 
Supported approaches are static, "
-                  "dynamic and qat."},
-    )
-    metric_name: Optional[str] = field(
-        default=None,
-        metadata={"help": "Metric used for the tuning strategy."},
-    )
-    is_relative: Optional[bool] = field(
-        default=True,
-        metadata={"help": "Metric tolerance mode, expected to be relative or absolute."},
-    )
-    perf_tol: Optional[float] = field(
-        default=0.01,
-        metadata={"help": "Performance tolerance when optimizing the model."},
-    )
-    benchmark: bool = field(
-        default=False,
-        metadata={"help": "Run benchmark."})
-    int8: bool = field(
-        default=False,
-        metadata={"help": "Whether to use the quantized int8 model."})
-    accuracy_only: bool = field(
-        default=False,
-        metadata={"help": "Whether to only test accuracy for the model tuned by Neural Compressor."})
-
-@dataclass
-class DistributedArguments:
-    """
-    Arguments setting up the distributed multi-node environment.
-    """
-
-    worker: str = field(
-        default=None,
-        metadata={"help": "List of node IP addresses in a string; there should be no spaces between addresses."},
-    )
-    task_index: int = field(
-        default=0,
-        metadata={"help": "Worker index; 0 represents the chief worker, while other workers are set as 1, 2, 3..."},
-    )
-# endregion
-
-
-def main():
-    # region Argument parsing
-    # See all possible arguments in src/transformers/training_args.py
-    # or by passing the --help flag to this script.
-    # We now keep distinct sets of args, for a cleaner separation of concerns.
-
-    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments, OptimizationArguments, DistributedArguments))
-    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
-        # If we pass only one argument to the script and it's the path to a json file,
-        # let's parse it to get our arguments.
-        model_args, data_args, training_args, optim_args, distributed_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
-    else:
-        model_args, data_args, training_args, optim_args, distributed_args = parser.parse_args_into_dataclasses()
-
-    if not (training_args.do_train or training_args.do_eval or training_args.do_predict):
-        exit("Must specify at least one of --do_train, --do_eval or --do_predict!")
-    # endregion
-
-    # region Logging
-    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
-        datefmt="%m/%d/%Y %H:%M:%S",
-        handlers=[logging.StreamHandler(sys.stdout)],
-    )
-    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)
-
-    # Set the verbosity to info of the Transformers logger (on main process only):
-    if is_main_process(training_args.local_rank):
-        transformers.utils.logging.set_verbosity_info()
-        transformers.utils.logging.enable_default_handler()
-        transformers.utils.logging.enable_explicit_format()
-    logger.info(f"Training/evaluation parameters {training_args}")
-    # endregion
-
-    # region Set the multi-node environment, the strategy and paths
-    strategy = None
-    worker_list = None
-    if distributed_args.worker is not None:
-        logger.info("distributed environment initialization...")
-
-        worker_list = distributed_args.worker.split(",")
-
-        from intel_extension_for_transformers.transformers.utils.utility_tf import distributed_init
-        distributed_init(worker_list, "worker", distributed_args.task_index)
-
-        strategy = tf.distribute.MultiWorkerMirroredStrategy()
-        from intel_extension_for_transformers.transformers.utils.utility_tf import get_filepath
-        training_args.output_dir = get_filepath(training_args.output_dir, strategy.cluster_resolver.task_type,
strategy.cluster_resolver.task_id) - else: - strategy = training_args.strategy - #endregion - - # region Checkpoints - checkpoint = None - if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: - checkpoint = get_last_checkpoint(training_args.output_dir) - if checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: - raise ValueError( - f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome." - ) - elif checkpoint is not None and training_args.resume_from_checkpoint is None: - logger.info( - f"Checkpoint detected, resuming training at {checkpoint}. To avoid this behavior, change " - "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." - ) - # endregion - - # region Dataset and labels - # Set seed before initializing model. - set_seed(training_args.seed) - - # Downloading and loading a dataset from the hub. In distributed training, the load_dataset function guarantee - # that only one local process can concurrently download the dataset. - datasets = load_dataset( - "glue", - data_args.task_name, - cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, - ) - # See more about loading any type of standard or custom dataset at - # https://huggingface.co/docs/datasets/loading_datasets.html. - - is_regression = data_args.task_name == "stsb" - if not is_regression: - label_list = datasets["train"].features["label"].names - num_labels = len(label_list) - else: - num_labels = 1 - - if data_args.predict_file is not None: - logger.info("Preparing user-supplied file for predictions...") - - data_files = {"data": data_args.predict_file} - - for key in data_files.keys(): - logger.info(f"Loading a local file for {key}: {data_files[key]}") - - if data_args.predict_file.endswith(".csv"): - # Loading a dataset from local csv files - user_dataset = load_dataset("csv", data_files=data_files, cache_dir=model_args.cache_dir) - else: - # Loading a dataset from local json files - user_dataset = load_dataset("json", data_files=data_files, cache_dir=model_args.cache_dir) - needed_keys = task_to_keys[data_args.task_name] - for key in needed_keys: - assert key in user_dataset["data"].features, f"Your supplied predict_file is missing the {key} key!" - datasets["user_data"] = user_dataset["data"] - # endregion - - # region Load model config and tokenizer - # - # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently - # download model & vocab. 
-    config = AutoConfig.from_pretrained(
-        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
-        num_labels=num_labels,
-        finetuning_task=data_args.task_name,
-        cache_dir=model_args.cache_dir,
-        revision=model_args.model_revision,
-        use_auth_token=True if model_args.use_auth_token else None,
-        _commit_hash="main",
-    )
-    tokenizer = AutoTokenizer.from_pretrained(
-        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
-        cache_dir=model_args.cache_dir,
-        use_fast=model_args.use_fast_tokenizer,
-        revision=model_args.model_revision,
-        use_auth_token=True if model_args.use_auth_token else None,
-        _commit_hash="main",
-    )
-    # endregion
-
-    # region Dataset preprocessing
-    sentence1_key, sentence2_key = task_to_keys[data_args.task_name]
-    non_label_column_names = [name for name in datasets["train"].column_names if name != "label"]
-
-    # Padding strategy
-    if data_args.pad_to_max_length:
-        padding = "max_length"
-    else:
-        # We will pad later, dynamically at batch creation, to the max sequence length in each batch
-        padding = False
-
-    # Some models have set the order of the labels to use, so let's make sure we do use it.
-    label_to_id = None
-    if config.label2id != PretrainedConfig(num_labels=num_labels).label2id and not is_regression:
-        # Some have all caps in their config, some don't.
-        label_name_to_id = {k.lower(): v for k, v in config.label2id.items()}
-        if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)):
-            label_to_id = {i: int(label_name_to_id[label_list[i]]) for i in range(num_labels)}
-        else:
-            logger.warning(
-                "Your model seems to have been trained with labels, but they don't match the dataset: "
-                f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}."
-                "\nIgnoring the model labels as a result.",
-            )
-            label_to_id = {label: i for i, label in enumerate(label_list)}
-    if label_to_id is not None:
-        config.label2id = label_to_id
-        config.id2label = {id: label for label, id in config.label2id.items()}
-    elif data_args.task_name is not None and not is_regression:
-        config.label2id = {l: i for i, l in enumerate(label_list)}
-        config.id2label = {id: label for label, id in config.label2id.items()}
-
-    if data_args.max_seq_length > tokenizer.model_max_length:
-        logger.warning(
-            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the "
-            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
- ) - max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) - - def preprocess_function(examples): - # Tokenize the texts - args = ( - (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key]) - ) - result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True) - - return result - - datasets = datasets.map(preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache) - - if data_args.pad_to_max_length: - data_collator = DefaultDataCollator(return_tensors="tf") - else: - data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf") - # endregion - - # region Metric function - from evaluate import load - metric = load("glue", data_args.task_name, cache_dir=model_args.cache_dir) - - def compute_metrics(preds, label_ids): - preds = preds["logits"] - preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1) - result = metric.compute(predictions=preds, references=label_ids) - if len(result) > 1: - result["combined_score"] = np.mean(list(result.values())).item() - return result - - # endregion - - def eval_func_mrpc(model): - label_ids: np.ndarray = None - tf_eval_dataset = tf_data["validation"] - - num_examples = sum(1 for _ in ( - tf_eval_dataset.unbatch() if hasattr(tf_eval_dataset, "unbatch") else tf_eval_dataset)) - logger.info(f"***** Running Evaluation *****") - logger.info(f" Num examples in dataset = {num_examples}") - logger.info(f" Batch size = {training_args.per_device_eval_batch_size}") - - preds: np.ndarray = None - infer = model.signatures["serving_default"] - - for idx, (inputs, labels) in enumerate(tf_eval_dataset): - for name in inputs: - inputs[name] = tf.constant(inputs[name].numpy(), dtype=infer.inputs[0].dtype) - - results = infer(**inputs) - if preds is None: - preds = results["Identity"].numpy() - else: - preds = np.append(preds, results["Identity"].numpy(), axis=0) - - if label_ids is None: - label_ids = labels[0].numpy() if isinstance( - labels, list) else labels.numpy() - else: - label_ids = np.append( - label_ids, - labels[0].numpy() - if isinstance(labels, list) else labels.numpy(), - axis=0) - test_predictions = {"logits": preds} - metrics = compute_metrics(test_predictions, label_ids) - - return metrics["accuracy"] - - with strategy.scope(): - # region Load pretrained model - if checkpoint is None: - model_path = model_args.model_name_or_path - else: - model_path = checkpoint - model = TFAutoModelForSequenceClassification.from_pretrained( - model_path, - config=config, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - ) - # endregion - - # region Optimizer, loss and compilation - optimizer = tf.keras.optimizers.Adam( - learning_rate=training_args.learning_rate, - beta_1=training_args.adam_beta1, - beta_2=training_args.adam_beta2, - epsilon=training_args.adam_epsilon, - clipnorm=training_args.max_grad_norm, - ) - if is_regression: - loss_fn = tf.keras.losses.MeanSquaredError() - metrics = [] - else: - loss_fn = tf.keras.losses.SparseCategoricalCrossentropy( - from_logits=True, reduction=tf.keras.losses.Reduction.SUM - ) - metrics = ["accuracy"] - model.compile(optimizer=optimizer, loss=loss_fn, metrics=metrics) - # endregion - - # region Convert data to a tf.data.Dataset - tf_data = dict() - max_samples = { - "train": data_args.max_train_samples, - "validation": data_args.max_eval_samples, - "validation_matched": data_args.max_eval_samples, 
- "validation_mismatched": data_args.max_eval_samples, - "test": data_args.max_predict_samples, - "test_matched": data_args.max_predict_samples, - "test_mismatched": data_args.max_predict_samples, - "user_data": None, - } - for key in datasets.keys(): - if key == "train" or key.startswith("validation"): - assert "label" in datasets[key].features, f"Missing labels from {key} data!" - if key == "train": - shuffle = True - batch_size = training_args.per_device_train_batch_size * (len(worker_list) if worker_list is not None else 1) - drop_remainder = True # Saves us worrying about scaling gradients for the last batch - else: - shuffle = False - batch_size = training_args.per_device_eval_batch_size * (len(worker_list) if worker_list is not None else 1) - drop_remainder = False - samples_limit = max_samples[key] - dataset = datasets[key] - if samples_limit is not None: - dataset = dataset.select(range(samples_limit)) - data = dataset.to_tf_dataset( - columns=[col for col in dataset.column_names if col not in set(non_label_column_names + ["label"])], - shuffle=shuffle, - batch_size=batch_size, - collate_fn=data_collator, - drop_remainder=drop_remainder, - # `label_cols` is needed for user-defined losses, such as in this example - # datasets v2.3.x need "labels", not "label" - label_cols=["labels"] if "label" in dataset.column_names else None, - ) - tf_data[key] = data - # endregion - - if optim_args.tune: - from intel_extension_for_transformers.transformers import metrics, objectives, QuantizationConfig, TFOptimization - optimization = TFOptimization( - model=model, - args=training_args, - train_dataset=tf_data["train"], - eval_dataset=tf_data["validation"], - compute_metrics=compute_metrics, - task_type=strategy.cluster_resolver.task_type if isinstance(strategy, tf.distribute.MultiWorkerMirroredStrategy) else None, - task_id=strategy.cluster_resolver.task_id if isinstance(strategy, tf.distribute.MultiWorkerMirroredStrategy) else None, - ) - - # use customized eval function - optimization.eval_func = eval_func_mrpc - - tune_metric = metrics.Metric( - name="accuracy", greater_is_better=True, is_relative=True, criterion=optim_args.perf_tol, - ) - quantization_config = QuantizationConfig( - framework="tensorflow", - approach="POSTTRAININGSTATIC", - metrics=[tune_metric], - objectives=[objectives.performance] - ) - quantized_model = optimization.quantize(quant_config=quantization_config) - exit(0) - - # region Training and validation - if training_args.do_train: - callbacks = [SavePretrainedCallback(output_dir=training_args.output_dir)] - if training_args.do_eval and not data_args.task_name == "mnli": - # Do both evaluation and training in the Keras fit loop, unless the task is MNLI - # because MNLI has two validation sets - validation_data = tf_data["validation"] - else: - validation_data = None - model.fit( - tf_data["train"], - validation_data=validation_data, - epochs=2, - callbacks=callbacks, - ) - # endregion - - # region Evaluation - if training_args.do_eval: - # We normally do validation as part of the Keras fit loop, but we run it independently - # if there was no fit() step (because we didn't train the model) or if the task is MNLI, - # because MNLI has a separate validation-mismatched validation set - logger.info("*** Evaluate ***") - - # Loop to handle MNLI double evaluation (matched, mis-matched) - if data_args.task_name == "mnli": - tasks = ["mnli", "mnli-mm"] - tf_datasets = [tf_data["validation_matched"], tf_data["validation_mismatched"]] - raw_datasets = 
[datasets["validation_matched"], datasets["validation_mismatched"]] - else: - tasks = [data_args.task_name] - tf_datasets = [tf_data["validation"]] - raw_datasets = [datasets["validation"]] - - num_examples = 0 - if optim_args.int8: - model = tf.saved_model.load(training_args.output_dir) - else: - from intel_extension_for_transformers.transformers.utils.utility_tf import keras2SavedModel - model = keras2SavedModel(model) - for raw_dataset, tf_dataset, task in zip(raw_datasets, tf_datasets, tasks): - num_examples += sum( - 1 for _ in (tf_dataset.unbatch() - if hasattr(tf_dataset, "unbatch") else tf_dataset - ) - ) - preds: np.ndarray = None - label_ids: np.ndarray = None - infer = model.signatures[list(model.signatures.keys())[0]] - - if optim_args.accuracy_only: - iterations = 1 - warmup = 0 - else: - iterations = 10 - warmup = 5 - latency_list = [] - - for idx in range(iterations): - iteration_time = 0 - for i, (inputs, labels) in enumerate(tf_dataset): - for name in inputs: - inputs[name] = tf.constant(inputs[name].numpy(), dtype=infer.inputs[0].dtype) - start = time.time() - results = infer(**inputs) - iteration_time += time.time() - start - if idx == 0: # only accumulate once all the preds and labels - if preds is None: - preds = results["Identity"].numpy() - else: - preds = np.append(preds, results["Identity"].numpy(), axis=0) - if label_ids is None: - label_ids = labels.numpy() - else: - label_ids = np.append(label_ids, labels.numpy(), axis=0) - latency_list.append(iteration_time) - logger.info("Iteration {} time: {} sec".format(idx, iteration_time)) - eval_metrics = compute_metrics({"logits": preds}, label_ids) - logger.info("\nEvaluation result: ") - logger.info("metric ({}) Accuracy: {}".format(task, eval_metrics["accuracy"])) - - average_iteration_time = np.array(latency_list[warmup:]).mean() - logger.info( - "Throughput: {} samples/sec".format( - num_examples / average_iteration_time) - ) - - # endregion - - # region Prediction - if training_args.do_predict or data_args.predict_file: - logger.info("*** Predict ***") - - # Loop to handle MNLI double evaluation (matched, mis-matched) - tasks = [] - tf_datasets = [] - raw_datasets = [] - if training_args.do_predict: - if data_args.task_name == "mnli": - tasks.extend(["mnli", "mnli-mm"]) - tf_datasets.extend([tf_data["test_matched"], tf_data["test_mismatched"]]) - raw_datasets.extend([datasets["test_matched"], datasets["test_mismatched"]]) - else: - tasks.append(data_args.task_name) - tf_datasets.append(tf_data["test"]) - raw_datasets.append(datasets["test"]) - if data_args.predict_file: - tasks.append("user_data") - tf_datasets.append(tf_data["user_data"]) - raw_datasets.append(datasets["user_data"]) - - if optim_args.int8: - model = tf.saved_model.load(training_args.output_dir) - - for raw_dataset, tf_dataset, task in zip(raw_datasets, tf_datasets, tasks): - if optim_args.int8: - preds: np.ndarray = None - infer = model.signatures[list(model.signatures.keys())[0]] - for i, (inputs, labels) in enumerate(tf_dataset): - for name in inputs: - inputs[name] = tf.constant(inputs[name].numpy(), dtype=infer.inputs[0].dtype) - results = infer(**inputs) - for val in results: - if preds is None: - preds = results[val].numpy() - else: - preds = np.append(preds, results[val].numpy(), axis=0) - test_predictions = {"logits": preds} - else: - test_predictions = model.predict(tf_dataset) - if "label" in raw_dataset: - test_metrics = compute_metrics(test_predictions, - raw_dataset["label"]) - print(f"Test metrics ({task}):") - print(test_metrics) - 
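The evaluation loop above loads the int8 SavedModel with `tf.saved_model.load` and times its serving signature over several iterations, discarding warmup runs before reporting throughput. For reference, the same measurement pattern reduces to the following self-contained sketch; the model directory, input names, and shapes are illustrative placeholders, not values taken from this PR:

```python
import time
import numpy as np
import tensorflow as tf

# Load the (possibly int8-quantized) SavedModel and pick its serving signature,
# mirroring the pattern used in the scripts above. The path is a placeholder.
model = tf.saved_model.load("./saved_int8")
infer = model.signatures[list(model.signatures.keys())[0]]

# A dummy BERT-style batch; the real scripts feed batches from a tf.data.Dataset.
batch = {
    "input_ids": tf.zeros((1, 128), dtype=infer.inputs[0].dtype),
    "token_type_ids": tf.zeros((1, 128), dtype=infer.inputs[0].dtype),
    "attention_mask": tf.zeros((1, 128), dtype=infer.inputs[0].dtype),
}

iterations, warmup = 10, 5
latencies = []
for _ in range(iterations):
    start = time.time()
    _ = infer(**batch)
    latencies.append(time.time() - start)

# Discard warmup iterations before averaging, as the benchmark loop above does.
print("Average latency: {:.4f} sec".format(np.mean(latencies[warmup:])))
```

In the full scripts, throughput is then reported as `num_examples / average_iteration_time` over the post-warmup iterations.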
-            if is_regression:
-                predictions_to_write = np.squeeze(test_predictions["logits"])
-            else:
-                predictions_to_write = np.argmax(test_predictions["logits"], axis=1)
-
-            output_predict_file = os.path.join(training_args.output_dir, f"predict_results_{task}.txt")
-            with open(output_predict_file, "w") as writer:
-                logger.info(f"***** Writing prediction results for {task} *****")
-                writer.write("index\tprediction\n")
-                for index, item in enumerate(predictions_to_write):
-                    if is_regression:
-                        writer.write(f"{index}\t{item:3.3f}\n")
-                    else:
-                        item = config.id2label[item]
-                        writer.write(f"{index}\t{item}\n")
-    # endregion
-
-
-if __name__ == "__main__":
-    main()
diff --git a/examples/huggingface/tensorflow/token-classification/quantization/README.md b/examples/huggingface/tensorflow/token-classification/quantization/README.md
deleted file mode 100644
index 8b05a9c1974..00000000000
--- a/examples/huggingface/tensorflow/token-classification/quantization/README.md
+++ /dev/null
@@ -1,35 +0,0 @@
-Step-by-Step
-=========
-
-This document provides step-by-step instructions for reproducing quantization on models for token classification (NER) tasks.
-
-# Prerequisite
-## 1. Installation
-
-Make sure you have installed Intel® Extension for Transformers and all the dependencies in the current example:
-
-```shell
-pip install intel-extension-for-transformers
-pip install -r requirements.txt
-```
-
-# Run
-
-## 1. Run Command (Shell)
-
-- Topology:
-   - bert_base_ner
-
-- To get the int8 model
-
-   ```
-   cd ptq
-   bash run_tuning.sh --topology=[topology] --output_model=./saved_int8
-   ```
-
-- To benchmark the int8 model
-
-   ```
-   cd ptq
-   bash run_benchmark.sh --topology=[topology] --config=./saved_int8 --mode=benchmark --int8=true
-   ```
\ No newline at end of file
diff --git a/examples/huggingface/tensorflow/token-classification/quantization/requirements.txt b/examples/huggingface/tensorflow/token-classification/quantization/requirements.txt
deleted file mode 100644
index 6e419404871..00000000000
--- a/examples/huggingface/tensorflow/token-classification/quantization/requirements.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-datasets >= 1.17
-sentencepiece != 0.1.92
-seqeval
-protobuf
-intel-tensorflow
-transformers
-accelerate
\ No newline at end of file
diff --git a/examples/huggingface/tensorflow/token-classification/quantization/run_benchmark.sh b/examples/huggingface/tensorflow/token-classification/quantization/run_benchmark.sh
deleted file mode 100644
index ddf9d917410..00000000000
--- a/examples/huggingface/tensorflow/token-classification/quantization/run_benchmark.sh
+++ /dev/null
@@ -1,129 +0,0 @@
-#!/bin/bash
-set -x
-
-function main {
-
-  init_params "$@"
-  run_benchmark
-
-}
-
-# init params
-function init_params {
-  topology="bert_base_ner"
-  iters=100
-  batch_size=16
-  tuned_checkpoint=saved_results
-  cache_dir="cache"
-  for var in "$@"
-  do
-    case $var in
-      --topology=*)
-          topology=$(echo $var |cut -f2 -d=)
-      ;;
-      --dataset_location=*)
-          dataset_location=$(echo $var |cut -f2 -d=)
-      ;;
-      --input_model=*)
-          input_model=$(echo $var |cut -f2 -d=)
-      ;;
-      --mode=*)
-          mode=$(echo $var |cut -f2 -d=)
-      ;;
-      --batch_size=*)
-          batch_size=$(echo $var |cut -f2 -d=)
-      ;;
-      --iters=*)
-          iters=$(echo ${var} |cut -f2 -d=)
-      ;;
-      --int8=*)
-          int8=$(echo ${var} |cut -f2 -d=)
-      ;;
-      --config=*)
-          tuned_checkpoint=$(echo $var |cut -f2 -d=)
-      ;;
-      --worker=*)
-          worker=$(echo $var |cut -f2 -d=)
-      ;;
-      --task_index=*)
-          task_index=$(echo $var |cut -f2 -d=)
-      ;;
-      --cache_dir=*)
-          cache_dir=$(echo $var |cut -f2 -d=)
-      ;;
-      *)
-          echo "Error: No such parameter: ${var}"
-          exit 1
-      ;;
-    esac
-  done
-
-}
-
-
-# run_benchmark
-function run_benchmark {
-    extra_cmd=''
-    MAX_SEQ_LENGTH=128
-    batch_size=1
-
-    if [[ ${mode} == "accuracy" ]]; then
-        mode_cmd=" --accuracy_only"
-    elif [[ ${mode} == "benchmark" ]]; then
-        mode_cmd=" --benchmark "
-    else
-        echo "Error: No such mode: ${mode}"
-        exit 1
-    fi
-
-    if [ "${topology}" = "bert_base_ner" ]; then
-        TASK_NAME="ner"
-        model_name_or_path="dslim/bert-base-NER"
-        approach="PostTrainingStatic"
-        dataset_name=conll2003
-    fi
-
-    if [[ ${int8} == "true" ]]; then
-        extra_cmd=$extra_cmd" --int8"
-    fi
-    echo $extra_cmd
-
-    if [ "${worker}" = "" ]
-    then
-        python -u run_ner.py \
-            --model_name_or_path ${model_name_or_path} \
-            --dataset_name ${dataset_name} \
-            --task_name ${TASK_NAME} \
-            --pad_to_max_length \
-            --do_eval \
-            --max_length ${MAX_SEQ_LENGTH} \
-            --per_device_eval_batch_size ${batch_size} \
-            --max_eval_samples 408 \
-            --output_dir ${tuned_checkpoint} \
-            --overwrite_output_dir \
-            --cache_dir ${cache_dir} \
-            --no_cuda \
-            ${mode_cmd} \
-            ${extra_cmd}
-    else
-        python -u ../run_ner.py \
-            --model_name_or_path ${model_name_or_path} \
-            --task_name ${TASK_NAME} \
-            --dataset_name ${dataset_name} \
-            --pad_to_max_length \
-            --do_eval \
-            --max_length ${MAX_SEQ_LENGTH} \
-            --per_device_eval_batch_size ${batch_size} \
-            --max_eval_samples 408 \
-            --output_dir ${tuned_checkpoint} \
-            --overwrite_output_dir \
-            --cache_dir ${cache_dir} \
-            --no_cuda \
-            --worker "${worker}" \
-            --task_index ${task_index} \
-            ${mode_cmd} \
-            ${extra_cmd}
-    fi
-}
-
-main "$@"
diff --git a/examples/huggingface/tensorflow/token-classification/quantization/run_ner.py b/examples/huggingface/tensorflow/token-classification/quantization/run_ner.py
deleted file mode 100644
index 30b9855c97f..00000000000
--- a/examples/huggingface/tensorflow/token-classification/quantization/run_ner.py
+++ /dev/null
@@ -1,696 +0,0 @@
-#!/usr/bin/env python
-# coding=utf-8
-# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Fine-tuning a 🤗 Transformers model on token classification tasks (NER, POS, CHUNKS) with TensorFlow/Keras,
-without using a Trainer.
-""" - -import logging -import sys -import random -import time -from dataclasses import dataclass, field -from typing import Optional - -from datasets import ClassLabel, load_dataset, load_metric -import numpy as np -import tensorflow as tf - -import transformers -from transformers import ( - AutoConfig, - AutoTokenizer, - DataCollatorForTokenClassification, - HfArgumentParser, - TFAutoModelForTokenClassification, - TFTrainingArguments, - set_seed, -) -from transformers.utils.versions import require_version -from transformers.trainer_utils import get_last_checkpoint, is_main_process - -logger = logging.getLogger(__name__) -require_version("datasets>=1.8.0", "To fix: pip install -r examples/tensorflow/token-classification/requirements.txt") - -class SavePretrainedCallback(tf.keras.callbacks.Callback): - # Hugging Face models have a save_pretrained() method that saves both the weights and the necessary - # metadata to allow them to be loaded as a pretrained model in future. This is a simple Keras callback - # that saves the model with this method after each epoch. - def __init__(self, output_dir, **kwargs): - super().__init__() - self.output_dir = output_dir - - def on_epoch_end(self, epoch, logs=None): - self.model.save_pretrained(self.output_dir) - - -# region Command-line arguments -@dataclass -class ModelArguments: - """ - Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. - """ - - model_name_or_path: str = field( - metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} - ) - config_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} - ) - tokenizer_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} - ) - cache_dir: Optional[str] = field( - default=None, - metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, - ) - model_revision: str = field( - default="main", - metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, - ) - use_auth_token: bool = field( - default=False, - metadata={ - "help": ( - "Will use the token generated when running `huggingface-cli login` (necessary to use this script " - "with private models)." - ) - }, - ) - -@dataclass -class DataTrainingArguments: - """ - Arguments pertaining to what data we are going to input our model for training and eval. 
-    """
-
-    task_name: Optional[str] = field(default="ner", metadata={"help": "The name of the task (ner, pos...)."})
-    dataset_name: Optional[str] = field(
-        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
-    )
-    dataset_config_name: Optional[str] = field(
-        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
-    )
-    train_file: Optional[str] = field(
-        default=None, metadata={"help": "The input training data file (a csv or JSON file)."}
-    )
-    validation_file: Optional[str] = field(
-        default=None,
-        metadata={"help": "An optional input evaluation data file to evaluate on (a csv or JSON file)."},
-    )
-    test_file: Optional[str] = field(
-        default=None,
-        metadata={"help": "An optional input test data file to predict on (a csv or JSON file)."},
-    )
-    text_column_name: Optional[str] = field(
-        default=None, metadata={"help": "The column name of text to input in the file (a csv or JSON file)."}
-    )
-    label_column_name: Optional[str] = field(
-        default=None, metadata={"help": "The column name of label to input in the file (a csv or JSON file)."}
-    )
-    overwrite_cache: bool = field(
-        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
-    )
-    preprocessing_num_workers: Optional[int] = field(
-        default=None,
-        metadata={"help": "The number of processes to use for the preprocessing."},
-    )
-    max_length: Optional[int] = field(default=256, metadata={"help": "Max length (in tokens) for truncation/padding"})
-    pad_to_max_length: bool = field(
-        default=False,
-        metadata={
-            "help": (
-                "Whether to pad all samples to model maximum sentence length. "
-                "If False, will pad the samples dynamically when batching to the maximum length in the batch. More "
-                "efficient on GPU but very bad for TPU."
-            )
-        },
-    )
-    max_train_samples: Optional[int] = field(
-        default=None,
-        metadata={
-            "help": (
-                "For debugging purposes or quicker training, truncate the number of training examples to this "
-                "value if set."
-            )
-        },
-    )
-    max_eval_samples: Optional[int] = field(
-        default=None,
-        metadata={
-            "help": (
-                "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
-                "value if set."
-            )
-        },
-    )
-    max_predict_samples: Optional[int] = field(
-        default=None,
-        metadata={
-            "help": (
-                "For debugging purposes or quicker training, truncate the number of prediction examples to this "
-                "value if set."
-            )
-        },
-    )
-    label_all_tokens: bool = field(
-        default=False,
-        metadata={
-            "help": (
-                "Whether to put the label for one word on all tokens generated by that word or just on the "
-                "one (in which case the other tokens will have a padding index)."
-            )
-        },
-    )
-    return_entity_level_metrics: bool = field(
-        default=False,
-        metadata={"help": "Whether to return all the entity levels during evaluation or just the overall ones."},
-    )
-
-    def __post_init__(self):
-        if self.dataset_name is None and self.train_file is None and self.validation_file is None:
-            raise ValueError("Need either a dataset name or a training/validation file.")
-        else:
-            if self.train_file is not None:
-                extension = self.train_file.split(".")[-1]
-                assert extension in ["csv", "json"], "`train_file` should be a csv or a json file."
-            if self.validation_file is not None:
-                extension = self.validation_file.split(".")[-1]
-                assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file."
-        self.task_name = self.task_name.lower()
-
-@dataclass
-class OptimizationArguments:
-    """
-    Arguments pertaining to what type of optimization we are going to apply on the model.
-    """
-
-    tune: bool = field(
-        default=False,
-        metadata={"help": "Whether or not to apply quantization."},
-    )
-    quantization_approach: Optional[str] = field(
-        default="PostTrainingStatic",
-        metadata={"help": "Quantization approach. Supported approaches are PostTrainingStatic, "
-                  "PostTrainingDynamic and QuantizationAwareTraining."},
-    )
-    metric_name: Optional[str] = field(
-        default=None,
-        metadata={"help": "Metric used for the tuning strategy."},
-    )
-    is_relative: Optional[bool] = field(
-        default=True,
-        metadata={"help": "Metric tolerance mode, expected to be relative or absolute."},
-    )
-    perf_tol: Optional[float] = field(
-        default=0.01,
-        metadata={"help": "Performance tolerance when optimizing the model."},
-    )
-    benchmark: bool = field(
-        default=False,
-        metadata={"help": "Run benchmark."})
-    int8: bool = field(
-        default=False,
-        metadata={"help": "Whether to use the quantized int8 model."})
-    accuracy_only: bool = field(
-        default=False,
-        metadata={"help": "Whether to only test accuracy of the model tuned by Neural Compressor."})
-
-@dataclass
-class DistributedArguments:
-    """
-    Arguments for setting up the distributed multinode environment.
-    """
-
-    worker: str = field(
-        default=None,
-        metadata={"help": "Comma-separated list of node IP addresses, with no spaces between addresses."},
-    )
-    task_index: int = field(
-        default=0,
-        metadata={"help": "Worker index; 0 denotes the chief worker, while the other workers are numbered 1, 2, 3, ..."},
-    )
-
-# endregion
-
-
-def main():
-    # region Argument Parsing
-    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments, OptimizationArguments, DistributedArguments))
-    model_args, data_args, training_args, optim_args, distributed_args = parser.parse_args_into_dataclasses()
-    # endregion
-
-    # region Logging
-    # We only want one process per machine to log things on the screen.
-    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
-        datefmt="%m/%d/%Y %H:%M:%S",
-        handlers=[logging.StreamHandler(sys.stdout)],
-    )
-    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)
-
-    # Set the verbosity to info of the Transformers logger (on main process only):
-    if is_main_process(training_args.local_rank):
-        transformers.utils.logging.set_verbosity_info()
-        transformers.utils.logging.enable_default_handler()
-        transformers.utils.logging.enable_explicit_format()
-    logger.info(f"Training/evaluation parameters {training_args}")
-    # endregion
-
-    # If passed along, set the training seed now.
-    if training_args.seed is not None:
-        set_seed(training_args.seed)
-    # endregion
-
-    # region Set the multinode environment, the strategy and paths
-    strategy = None
-    worker_list = None
-    if distributed_args.worker is not None:
-        logger.info("Initializing the distributed environment...")
-
-        worker_list = distributed_args.worker.split(",")
-
-        from intel_extension_for_transformers.transformers.utils.utility_tf import distributed_init
-        distributed_init(worker_list, "worker", distributed_args.task_index)
-
-        strategy = tf.distribute.MultiWorkerMirroredStrategy()
-        from intel_extension_for_transformers.transformers.utils.utility_tf import get_filepath
-        training_args.output_dir = get_filepath(training_args.output_dir, strategy.cluster_resolver.task_type, strategy.cluster_resolver.task_id)
-    else:
-        strategy = training_args.strategy
-    # endregion
-
-    # region Loading datasets
-    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
-    # or just provide the name of one of the public datasets for token classification tasks available on the hub at https://huggingface.co/datasets/
-    # (the dataset will be downloaded automatically from the datasets Hub).
-    #
-    # For CSV/JSON files, this script will use the column called 'tokens' or the first column if no column called
-    # 'tokens' is found. You can easily tweak this behavior (see below).
-    #
-    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
-    # download the dataset.
-    if data_args.dataset_name is not None:
-        # Downloading and loading a dataset from the hub.
-        raw_datasets = load_dataset(
-            data_args.dataset_name,
-            data_args.dataset_config_name,
-            use_auth_token=True if model_args.use_auth_token else None,
-            cache_dir=model_args.cache_dir,
-        )
-    else:
-        data_files = {}
-        if data_args.train_file is not None:
-            data_files["train"] = data_args.train_file
-        if data_args.validation_file is not None:
-            data_files["validation"] = data_args.validation_file
-        extension = data_args.train_file.split(".")[-1]
-        raw_datasets = load_dataset(
-            extension,
-            data_files=data_files,
-            use_auth_token=True if model_args.use_auth_token else None,
-            cache_dir=model_args.cache_dir,
-        )
-    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
-
-    if raw_datasets["train"] is not None:
-        column_names = raw_datasets["train"].column_names
-        features = raw_datasets["train"].features
-    else:
-        column_names = raw_datasets["validation"].column_names
-        features = raw_datasets["validation"].features
-
-    if data_args.text_column_name is not None:
-        text_column_name = data_args.text_column_name
-    elif "tokens" in column_names:
-        text_column_name = "tokens"
-    else:
-        text_column_name = column_names[0]
-
-    if data_args.label_column_name is not None:
-        label_column_name = data_args.label_column_name
-    elif f"{data_args.task_name}_tags" in column_names:
-        label_column_name = f"{data_args.task_name}_tags"
-    else:
-        label_column_name = column_names[1]
-
-    # In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the
-    # unique labels.
- def get_label_list(labels): - unique_labels = set() - for label in labels: - unique_labels = unique_labels | set(label) - label_list = list(unique_labels) - label_list.sort() - return label_list - - if isinstance(features[label_column_name].feature, ClassLabel): - label_list = features[label_column_name].feature.names - # No need to convert the labels since they are already ints. - label_to_id = {i: i for i in range(len(label_list))} - else: - label_list = get_label_list(raw_datasets["train"][label_column_name]) - label_to_id = {l: i for i, l in enumerate(label_list)} - num_labels = len(label_list) - # endregion - - # region Load config and tokenizer - # - # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently - # download model & vocab. - - config = AutoConfig.from_pretrained( - model_args.config_name if model_args.config_name else model_args.model_name_or_path, - num_labels=num_labels, - finetuning_task=data_args.task_name, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - _commit_hash="main", - ) - - tokenizer_name_or_path = model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path - if not tokenizer_name_or_path: - raise ValueError( - "You are instantiating a new tokenizer from scratch. This is not supported by this script." - "You can do it from another script, save it, and load it from here, using --tokenizer_name." - ) - - if config.model_type in {"gpt2", "roberta"}: - tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, cache_dir=model_args.cache_dir, use_fast=True, - add_prefix_space=True, _commit_hash="main",) - else: - tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, cache_dir=model_args.cache_dir, use_fast=True, - _commit_hash="main",) - # endregion - - # region Preprocessing the raw datasets - # First we tokenize all the texts. - # should always use padding because the current ptq does not use tf > 2.8 - # so no RaggedTensor is supported - padding = "max_length" if data_args.pad_to_max_length else False - - # Tokenize all texts and align the labels with them. - - def tokenize_and_align_labels(examples): - tokenized_inputs = tokenizer( - examples[text_column_name], - max_length=data_args.max_length, - padding=padding, - truncation=True, - # We use this argument because the texts in our dataset are lists of words (with a label for each word). - is_split_into_words=True, - ) - - labels = [] - for i, label in enumerate(examples[label_column_name]): - word_ids = tokenized_inputs.word_ids(batch_index=i) - previous_word_idx = None - label_ids = [] - for word_idx in word_ids: - # Special tokens have a word id that is None. We set the label to -100 so they are automatically - # ignored in the loss function. - if word_idx is None: - label_ids.append(-100) - # We set the label for the first token of each word. - elif word_idx != previous_word_idx: - label_ids.append(label_to_id[label[word_idx]]) - # For the other tokens in a word, we set the label to either the current label or -100, depending on - # the label_all_tokens flag. 
-                else:
-                    label_ids.append(label_to_id[label[word_idx]] if data_args.label_all_tokens else -100)
-                previous_word_idx = word_idx
-
-            labels.append(label_ids)
-        tokenized_inputs["labels"] = labels
-        return tokenized_inputs
-
-    processed_raw_datasets = raw_datasets.map(
-        tokenize_and_align_labels,
-        batched=True,
-        remove_columns=raw_datasets["train"].column_names,
-        desc="Running tokenizer on dataset",
-    )
-
-    train_dataset = processed_raw_datasets["train"]
-    eval_dataset = processed_raw_datasets["validation"]
-
-    if data_args.max_train_samples is not None:
-        max_train_samples = min(len(train_dataset), data_args.max_train_samples)
-        train_dataset = train_dataset.select(range(max_train_samples))
-
-    if data_args.max_eval_samples is not None:
-        max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
-        eval_dataset = eval_dataset.select(range(max_eval_samples))
-
-    # Log a few random samples from the training set:
-    for index in random.sample(range(len(train_dataset)), 3):
-        logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
-    # endregion
-
-    # Metrics
-    metric = load_metric("seqeval")
-
-    def get_labels(y_pred, y_true):
-        # Transform the prediction and reference tensors to numpy arrays
-
-        # Remove ignored index (special tokens)
-        true_predictions = [
-            [label_list[p] for (p, l) in zip(pred, gold_label) if l != -100]
-            for pred, gold_label in zip(y_pred, y_true)
-        ]
-        true_labels = [
-            [label_list[l] for (p, l) in zip(pred, gold_label) if l != -100]
-            for pred, gold_label in zip(y_pred, y_true)
-        ]
-        return true_predictions, true_labels
-
-    def compute_metrics(predictions, labels):
-        predictions = predictions["logits"]
-        predictions = np.argmax(predictions, axis=-1)
-
-        attention_mask = eval_dataset.with_format("tf")["attention_mask"]
-        labels[attention_mask == 0] = -100
-
-        # Remove ignored index (special tokens)
-        preds, refs = get_labels(predictions, labels)
-
-        metric.add_batch(
-            predictions=preds,
-            references=refs,
-        )
-        results = metric.compute()
-
-        if data_args.return_entity_level_metrics:
-            # Unpack nested dictionaries
-            final_results = {}
-            for key, value in results.items():
-                if isinstance(value, dict):
-                    for n, v in value.items():
-                        final_results[f"{key}_{n}"] = v
-                else:
-                    final_results[key] = value
-            return final_results
-        else:
-            return {
-                "precision": results["overall_precision"],
-                "recall": results["overall_recall"],
-                "f1": results["overall_f1"],
-                "accuracy": results["overall_accuracy"],
-            }
-
-    # endregion
-
-    with strategy.scope():
-        # region Initialize model
-        if model_args.model_name_or_path:
-            model = TFAutoModelForTokenClassification.from_pretrained(
-                model_args.model_name_or_path,
-                config=config,
-                cache_dir=model_args.cache_dir,
-                revision=model_args.model_revision,
-                use_auth_token=True if model_args.use_auth_token else None,
-            )
-        else:
-            logger.info("Training new model from scratch")
-            model = TFAutoModelForTokenClassification.from_config(config)
-
-        model.resize_token_embeddings(len(tokenizer))
-        # endregion
-
-        # region Create TF datasets
-
-        # We need the DataCollatorForTokenClassification here, as we need to correctly pad labels as
-        # well as inputs.
- collate_fn = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="tf") - total_train_batch_size = training_args.per_device_train_batch_size * (len(worker_list) if worker_list is not None else 1) - - dataset_options = tf.data.Options() - dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF - - # model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in - # training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also - # use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names - # yourself if you use this method, whereas they are automatically inferred from the model input names when - # using model.prepare_tf_dataset() - # For more info see the docs: - # https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset - # https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset - - tf_train_dataset = model.prepare_tf_dataset( - train_dataset, - collate_fn=collate_fn, - batch_size=total_train_batch_size, - shuffle=True, - ).with_options(dataset_options) - total_eval_batch_size = training_args.per_device_eval_batch_size * (len(worker_list) if worker_list is not None else 1) - tf_eval_dataset = model.prepare_tf_dataset( - eval_dataset, - collate_fn=collate_fn, - batch_size=total_eval_batch_size, - shuffle=False, - ).with_options(dataset_options) - - # endregion - - # region Optimizer, loss and compilation - optimizer = tf.keras.optimizers.Adam( - learning_rate=training_args.learning_rate, - beta_1=training_args.adam_beta1, - beta_2=training_args.adam_beta2, - epsilon=training_args.adam_epsilon, - clipnorm=training_args.max_grad_norm, - ) - - model.compile(optimizer=optimizer, jit_compile=training_args.xla) - # endregion - - if optim_args.tune: - from intel_extension_for_transformers.transformers import metrics, objectives, QuantizationConfig, TFOptimization - optimization = TFOptimization( - model=model, - args=training_args, - train_dataset=tf_train_dataset, - eval_dataset=tf_eval_dataset, - compute_metrics=compute_metrics, - task_type=strategy.cluster_resolver.task_type if isinstance(strategy, tf.distribute.MultiWorkerMirroredStrategy) else None, - task_id=strategy.cluster_resolver.task_id if isinstance(strategy, tf.distribute.MultiWorkerMirroredStrategy) else None, - ) - tune_metric = metrics.Metric( - name="accuracy", greater_is_better=True, is_relative=True, criterion=optim_args.perf_tol, - ) - quantization_config = QuantizationConfig( - framework="tensorflow", - approach="POSTTRAININGSTATIC", - metrics=[tune_metric], - objectives=[objectives.performance] - ) - quantized_model = optimization.quantize(quant_config=quantization_config) - exit(0) - - # region Training - if training_args.do_train: - callbacks = [SavePretrainedCallback(output_dir=training_args.output_dir)] - logger.info("***** Running training *****") - logger.info(f" Num examples = {len(train_dataset)}") - logger.info(f" Num Epochs = {training_args.num_train_epochs}") - logger.info(f" Instantaneous batch size per device = {training_args.per_device_train_batch_size}") - logger.info(f" Total train batch size = {total_train_batch_size}") - # Only show the progress bar once on each machine. 
- - model.fit( - tf_train_dataset, - validation_data=tf_eval_dataset, - epochs=int(training_args.num_train_epochs), - callbacks=callbacks, - ) - # endregion - - # region Evaluation - if training_args.do_eval: - # We normally do validation as part of the Keras fit loop, but we run it independently - # if there was no fit() step (because we didn't train the model) or if the task is MNLI, - # because MNLI has a separate validation-mismatched validation set - logger.info("*** Evaluate ***") - - tasks = [data_args.task_name] - tf_datasets = [tf_eval_dataset] - raw_datasets = [processed_raw_datasets["validation"]] - - num_examples = 0 - - if optim_args.int8: - model = tf.saved_model.load(training_args.output_dir) - else: - from intel_extension_for_transformers.transformers.utils.utility_tf import keras2SavedModel - model = keras2SavedModel(model) - - for raw_dataset, tf_dataset, task in zip(raw_datasets, tf_datasets, tasks): - num_examples += sum( - 1 for _ in (tf_dataset.unbatch() - if hasattr(tf_dataset, "unbatch") else tf_dataset - ) - ) - - preds: np.ndarray = None - label_ids: np.ndarray = None - infer = model.signatures[list(model.signatures.keys())[0]] - - if optim_args.accuracy_only: - iterations = 1 - warmup = 0 - else: - iterations = 10 - warmup = 5 - latency_list = [] - - for idx in range(iterations): - iteration_time = 0 - for i, (inputs, labels) in enumerate(tf_dataset): - for name in inputs: - inputs[name] = tf.constant(inputs[name].numpy(), dtype=infer.inputs[0].dtype) - start = time.time() - results = infer(**inputs) - iteration_time += time.time() - start - if idx == 0: # only accumulate once all the preds and labels - for val in results: - if preds is None: - preds = results[val].numpy() - else: - preds = np.append(preds, results[val].numpy(), axis=0) - if label_ids is None: - label_ids = labels.numpy() - else: - label_ids = np.append(label_ids, labels.numpy(), axis=0) - - latency_list.append(iteration_time) - logger.info("Iteration {} time: {} sec".format(idx, iteration_time)) - eval_metrics = compute_metrics({"logits": preds}, label_ids) - logger.info("\nEvaluation result: ") - logger.info("metric ({}) Accuracy: {}".format(task, eval_metrics["accuracy"])) - - average_iteration_time = np.array(latency_list[warmup:]).mean() - logger.info( - "Throughput: {} samples/sec".format( - num_examples / average_iteration_time) - ) - # endregion - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/examples/huggingface/tensorflow/token-classification/quantization/run_tuning.sh b/examples/huggingface/tensorflow/token-classification/quantization/run_tuning.sh deleted file mode 100644 index 415cf26ddd1..00000000000 --- a/examples/huggingface/tensorflow/token-classification/quantization/run_tuning.sh +++ /dev/null @@ -1,104 +0,0 @@ -#!/bin/bash -set -x - -function main { - - init_params "$@" - run_tuning - -} - -# init params -function init_params { - topology="bert_base_ner" - tuned_checkpoint="saved_results" - extra_cmd="" - batch_size=8 - MAX_SEQ_LENGTH=128 - model_type="bert" - approach="PostTrainingStatic" - cache_dir="cache" - for var in "$@" - do - case $var in - --topology=*) - topology=$(echo $var |cut -f2 -d=) - ;; - --dataset_location=*) - dataset_location=$(echo $var |cut -f2 -d=) - ;; - --input_model=*) - input_model=$(echo $var |cut -f2 -d=) - ;; - --output_model=*) - tuned_checkpoint=$(echo $var |cut -f2 -d=) - ;; - --worker=*) - worker=$(echo $var |cut -f2 -d=) - ;; - --task_index=*) - task_index=$(echo $var |cut -f2 -d=) - ;; - --cache_dir=*) - 
cache_dir=$(echo $var |cut -f2 -d=) - ;; - *) - echo "Error: No such parameter: ${var}" - exit 1 - ;; - esac - done - -} - -# run_tuning -function run_tuning { - batch_size=64 - if [ "${topology}" = "bert_base_ner" ]; then - TASK_NAME="ner" - model_name_or_path="dslim/bert-base-NER" - approach="PostTrainingStatic" - dataset_name=conll2003 - fi - - if [ "${worker}" = "" ] - then - python -u run_ner.py \ - --model_name_or_path ${model_name_or_path} \ - --dataset_name ${dataset_name} \ - --task_name ${TASK_NAME} \ - --pad_to_max_length \ - --do_eval \ - --max_length ${MAX_SEQ_LENGTH} \ - --per_device_train_batch_size ${batch_size} \ - --per_device_eval_batch_size ${batch_size} \ - --output_dir ${tuned_checkpoint} \ - --no_cuda \ - --overwrite_output_dir \ - --cache_dir ${cache_dir} \ - --quantization_approach ${approach} \ - --tune \ - ${extra_cmd} - else - python -u run_ner.py \ - --model_name_or_path ${model_name_or_path} \ - --dataset_name ${dataset_name} \ - --task_name ${TASK_NAME} \ - --pad_to_max_length \ - --do_eval \ - --max_length ${MAX_SEQ_LENGTH} \ - --per_device_train_batch_size ${batch_size} \ - --per_device_eval_batch_size ${batch_size} \ - --output_dir ${tuned_checkpoint} \ - --no_cuda \ - --overwrite_output_dir \ - --cache_dir ${cache_dir} \ - --quantization_approach ${approach} \ - --tune \ - --worker "${worker}" \ - --task_index ${task_index} \ - ${extra_cmd} - fi -} - -main "$@" diff --git a/intel_extension_for_transformers/transformers/__init__.py b/intel_extension_for_transformers/transformers/__init__.py index c7b129f1ba0..300ba84b71e 100644 --- a/intel_extension_for_transformers/transformers/__init__.py +++ b/intel_extension_for_transformers/transformers/__init__.py @@ -31,7 +31,7 @@ SUPPORTED_DISTILLATION_CRITERION_MODE, DistillationCriterionMode, ) -from .optimizer import NoTrainerOptimizer, Orchestrate_optimizer + from .optimizer_tf import TFOptimization from .pruning import SUPPORTED_PRUNING_MODE, PrunerConfig, PruningMode from .quantization import SUPPORTED_QUANT_MODE, QuantizationMode diff --git a/intel_extension_for_transformers/transformers/optimizer.py b/intel_extension_for_transformers/transformers/optimizer.py deleted file mode 100644 index de952776ffa..00000000000 --- a/intel_extension_for_transformers/transformers/optimizer.py +++ /dev/null @@ -1,447 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (c) 2022 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
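For context before the module body below: the deleted `optimizer.py` implemented one-shot orchestration on top of Neural Compressor's experimental `Scheduler`, the API this PR migrates away from. Its core pattern, condensed into a sketch (INC 1.x experimental API; `model`, `train_func`, and `eval_func` are caller-supplied placeholders, and the components are shown with default configs for brevity):

```python
# Condensed sketch of the Scheduler-based pattern wrapped by the deleted
# Orchestrate_optimizer; it relies on the INC 1.x experimental API.
from neural_compressor.experimental import common, Distillation, Pruning, Quantization
from neural_compressor.experimental.scheduler import Scheduler

scheduler = Scheduler()
scheduler.model = common.Model(model)          # `model` is supplied by the caller

# Combine the components into one agent so pruning, distillation and
# quantization run together in a single training pass.
combined = scheduler.combine(Pruning(), Distillation(), Quantization())
combined.train_func = train_func               # user-provided training loop
combined.eval_func = eval_func                 # user-provided evaluation function
scheduler.append(combined)

opt_model = scheduler()                        # returns the optimized model wrapper
```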
-"""Optimization: provides the orchestrate optimizer for Pytorch.""" -import logging -import os -import shlex - -from neural_compressor.experimental import( - common, - Component, - Distillation, - Quantization, - Pruning, -) -from neural_compressor.training import prepare_compression -from neural_compressor.quantization import fit -from neural_compressor.config import ( - PostTrainingQuantConfig, - QuantizationAwareTrainingConfig, -) -from neural_compressor.experimental.scheduler import Scheduler -from intel_extension_for_transformers.transformers import( - DistillationConfig, - Provider, - PruningConfig -) -from neural_compressor.config import ( - PostTrainingQuantConfig, - QuantizationAwareTrainingConfig, -) -from intel_extension_for_transformers.transformers.utils.utility import LazyImport -from intel_extension_for_transformers.transformers.quantization import QuantizationMode -from transformers import PreTrainedModel, PretrainedConfig -from transformers.file_utils import WEIGHTS_NAME -from typing import Callable, Optional, Union, List - -torch = LazyImport("torch") - -logger = logging.getLogger(__name__) - - -class Orchestrate_optimizer: - """Orchestrate_optimizer aggregates and orchestrates components such as Quantization, Pruning and Distillation.""" - def __init__( - self, - model, - components: Optional[List[Component]] = [], - eval_func: Optional[Callable] = None, - train_func: Optional[Callable] = None, - output_dir: Optional[str] = "saved_results", - ): - """Init an orchestrate optimizer. - - Args: - model: Model to quantize and/or prune. - components: List of Component objects which contains Quantization, Pruning, Distillation objects. - eval_func: Evaluation function to evaluate the tuning objective. - train_func: Training function which will be combined with pruning. - """ - if len(components) == 0: - raise RuntimeError("`NLPOptimizer` requires at least one `Quantization`, " - "`Pruning` or `Distillation` object") - self.output_dir = output_dir - if hasattr(model, 'config') and isinstance(model.config, PretrainedConfig): - self.model_config = model.config - self.enable_inc_quant = False - self.enable_inc_pruning = False - self.scheduler = Scheduler() - self.scheduler.model = common.Model(model) - - if len(components) > 1: - agent = self.scheduler.combine(*components) - agent.train_func = train_func - agent.eval_func = eval_func - for component in components: - if isinstance(component, Distillation) and hasattr(component, 'criterion'): - agent.criterion = component.criterion - if isinstance(component, Quantization): - self.enable_inc_quant = True - if isinstance(component, Pruning): - self.enable_inc_pruning = True - self.scheduler.append(agent) - else: - self.scheduler.append(*components) - - def fit(self): - """Run the scheduler.""" - self.opt_model = self.scheduler() - self.save_model(self.output_dir) - if self.enable_inc_pruning == True: - stats, sparsity = self.opt_model.report_sparsity() - logger.info(stats) - logger.info(sparsity) - return self.opt_model.model - - def save_model(self, output_dir, tokenizer=None): - """Save the model and tokenizer in the output directory. - - Args: - output_dir: the path to save config.json and pytorch_model.bin. - tokenizer (object, optional): the tokenizer object, use it if you want to - save tokenizer.json in output_dir. Defaults to None. 
- """ - os.makedirs(shlex.quote(output_dir), exist_ok=True) - torch.save(self.opt_model.quantized_state_dict(), os.path.join(shlex.quote(output_dir), WEIGHTS_NAME)) - if hasattr(self, 'model_config') and isinstance(self.model_config, PretrainedConfig): - if self.enable_inc_quant == True: - self.model_config.torch_dtype = "int8" - self.model_config.save_pretrained(output_dir) - if tokenizer: # pragma: no cover - tokenizer.save_pretrained(output_dir) - logger.info("orchestrate_optimizations model and configure file have saved to {}".format( - output_dir)) - - -class NoTrainerOptimizer: # pragma: no cover - """Optimizer without using Trainer.""" - def __init__( - self, - model, - output_dir: Optional[str] = "saved_results", - ): - """Init a NoTrainerOptimizer object. - - Args: - model: FP32 model specified for low precision tuning. - output_dir: The folder for saving the results. - """ - self.model = model - self.teacher_model = None - self._eval_func = None - self._train_func = None - self._calib_func = None - self._calib_dataloader = None - self.output_dir = output_dir - self.quant_config = None - self.pruning_config = None - self.distillation_config = None - self._provider = Provider.INC.value - self.pruner = None - self.quantizer = None - self.distiller = None - self.in_training = False - self.enable_inc_quant = False - - @property - def eval_func(self): - """Get the evaluation function.""" - return self._eval_func - - @property - def train_func(self): - """Get the train function.""" - return self._train_func - - @property - def calib_func(self): - """Get the calib function.""" - return self._calib_func - - @property - def provider(self): - """Get the provider.""" - return self._provider - - @property - def calib_dataloader(self): - """Get the calibration dataloader.""" - return self._calib_dataloader - - @eval_func.setter - def eval_func(self, func: Callable): - """Set the evaluation function. - - Args: - func: evaluation function. - """ - self._eval_func = func - - @train_func.setter - def train_func(self, func: Callable): - """Set the train function. - - Args: - func: train function. - """ - self._train_func = func - - @provider.setter - def provider(self, provider): - """Set the provider. - - Args: - provider: optimization provider. - """ - self._provider = provider - - @calib_dataloader.setter - def calib_dataloader(self, dataloader): - """Set the calibration dataloader. - - Args: - dataloader: calibration dataloader. 
- """ - # transformer issue #1 - if dataloader.batch_size is None: - from .utils.utility import _build_inc_dataloader - self._calib_dataloader = _build_inc_dataloader(dataloader) - else: - self._calib_dataloader = dataloader - - def _inc_quantize( - self, - quant_config, - provider: str = Provider.INC.value, - ): - """Do the quantization.""" - if self._eval_func is not None: - self.quantizer.eval_func = self._eval_func - if self._calib_func is not None: - self.quantizer.calib_func = self._calib_func - if isinstance(quant_config, PostTrainingQuantConfig): - if quant_config.backend == "ipex": - self.model_config = self.model.config # jit model will loss config - if self._calib_dataloader is None: - self._calib_dataloader = self.get_train_dataloader() - self.opt_model = fit(self.model, - conf=quant_config, - calib_dataloader=self._calib_dataloader, - eval_func=self._eval_func) - else: - compression_manager = prepare_compression(self.model, quant_config) - compression_manager.callbacks.on_train_begin() - self.train() - compression_manager.callbacks.on_train_end() - self.opt_model = compression_manager.model - self.enable_inc_quant = True - self.save_model(self.args.output_dir) - return self.opt_model.model - - def quantize( - self, - quant_config: Union[PostTrainingQuantConfig, QuantizationAwareTrainingConfig] = None, - provider: str = Provider.INC.value, - eval_func: Optional[Callable] = None, - train_func: Optional[Callable] = None, - calib_func: Optional[Callable] = None, - calib_dataloader=None, - ): - """Prepare for invoking the _inc_quantize function. - - Args: - quant_config: quantization config. - provider: define the quantization provider. - eval_func: evaluation function. - train_func: train function. - calib_func: calibration function. - calib_dataloader: calibration dataloader. - """ - if eval_func is not None: - self._eval_func = eval_func - if train_func is not None: - self._train_func = train_func - if calib_func is not None: - self._calib_func = calib_func - if calib_dataloader is not None: - self._calib_dataloader = calib_dataloader - - if self.quantizer is None: - self._provider = Provider[provider.upper()].value - - if self._provider == Provider.INC.value: - return self._inc_quantize(quant_config=quant_config, provider=provider) - else: - assert False, "Unsupported provider:{}".format(self._provider) - - def init_pruner( - self, - pruning_config = None, - provider: str = Provider.INC.value, - ): - """Init a Pruning object with config. - - Args: - pruning_config: pruning config. - provider: define the pruning provider. - """ - from neural_compressor.experimental import Pruning - self.pruning_config = pruning_config - self.metrics = self.pruning_config.metrics - self._provider = Provider[provider.upper()].value - - assert isinstance(self.pruning_config, PruningConfig), \ - "please pass a instance of PruningConfig to trainer.prune!" - - pruner = Pruning(self.pruning_config.inc_config) - pruner.model = common.Model(self.model) - - self.pruner = pruner - return pruner - - def prune( - self, - pruning_config = None, - provider: str = Provider.INC.value, - eval_func: Optional[Callable] = None, - train_func: Optional[Callable] = None, - ): - """Do the pruning. - - Args: - pruning_config: pruning config. - provider: define the pruning provider. - eval_func: evaluation function. - train_func: train function. 
- """ - if self.pruner is None: - self.init_pruner(pruning_config=pruning_config, provider=provider) - if eval_func is not None: - self._eval_func = eval_func - if train_func is not None: - self._train_func = train_func - - self.pruner.eval_func = self._eval_func - - self.pruner.pruning_func = self._train_func - - self.opt_model = self.pruner.fit() - self.save_model(self.output_dir) - stats, sparsity = self.opt_model.report_sparsity() - logger.info(stats) - logger.info(sparsity) - - return self.opt_model.model - - def init_distiller( - self, - distillation_config, - teacher_model, - provider: str = Provider.INC.value, - ): - """Init a Distillation object with config and the teacher model. - - Args: - distillation_config: distillation config. - teacher_model: set the teacher model. - provider: define the distillation provider. - """ - from neural_compressor.experimental import Distillation, common - assert isinstance(distillation_config, DistillationConfig), \ - "please pass a instance of PruningConfig to trainer.prune!" - self.distillation_config = distillation_config - self._provider = Provider[provider.upper()].value - self.metrics = self.distillation_config.metrics - self.teacher_model = teacher_model - - distiller = Distillation(self.distillation_config.inc_config) - distiller.model = common.Model(self.model) - distiller.teacher_model = common.Model(self.teacher_model) - - self.distiller = distiller - return distiller - - def distill( - self, - distillation_config, - teacher_model, - provider: str = Provider.INC.value, - eval_func: Optional[Callable] = None, - train_func: Optional[Callable] = None, - ): - """Do the distillation. - - Args: - distillation_config: distillation config. - teacher_model: set the teacher model. - provider: define the distillation provider. - eval_func: evaluation function. - train_func: train function. - """ - if self.distiller is None: - self.init_distiller( - distillation_config=distillation_config, - teacher_model=teacher_model, - provider=provider - ) - if eval_func is not None: - self._eval_func = eval_func - if train_func is not None: - self._train_func = train_func - - self.distiller.eval_func = self._eval_func - self.distiller.train_func = self._train_func - self.distiller.create_criterion() - - self.opt_model = self.distiller.fit() - self.save_model(self.output_dir) - return self.opt_model.model - - def _save_inc_int8(self, opt_model, output_dir): - """Save the optimized model in the output directory. - - Args: - opt_model: optimized model. - output_dir: output path. - """ - self.model.config.architectures = [self.model.__class__.__name__] - self.model.config.torch_dtype = "int8" - if isinstance(self.model.config, PretrainedConfig): - self.model.config.save_pretrained(output_dir) - weights_file = os.path.join(os.path.abspath( - os.path.expanduser(output_dir)), WEIGHTS_NAME) - torch.save(opt_model.quantized_state_dict(), weights_file) - - def save_model(self, output_dir, tokenizer=None): - """Save the model and tokenizer in the output directory. - - Args: - output_dir: the path to save config.json and pytorch_model.bin. - tokenizer (object, optional): the tokenizer object, use it if you want to - save tokenizer.json in output_dir. Defaults to None. 
- """ - os.makedirs(shlex.quote(output_dir), exist_ok=True) - torch.save(self.opt_model.quantized_state_dict(), os.path.join(shlex.quote(output_dir), WEIGHTS_NAME)) - if self.enable_inc_quant and self.opt_model: - self._save_inc_int8(self.opt_model, output_dir) - else: - self.model.save_pretrained(output_dir) - self.model.config.save_pretrained(output_dir) - if tokenizer: # pragma: no cover - tokenizer.save_pretrained(output_dir) - logger.info("Optimized model and configure file have saved to {}".format( - output_dir)) diff --git a/intel_extension_for_transformers/transformers/trainer.py b/intel_extension_for_transformers/transformers/trainer.py index a2534977363..387a9466389 100644 --- a/intel_extension_for_transformers/transformers/trainer.py +++ b/intel_extension_for_transformers/transformers/trainer.py @@ -29,18 +29,15 @@ from neural_compressor import __version__ as nc_version from neural_compressor.utils import logger from intel_extension_for_transformers.transformers import ( - DistillationConfig, Provider, - PruningMode, - QuantizationConfig, - QuantizationMode, - PruningConfig, DynamicLengthConfig, BenchmarkConfig, ) from neural_compressor.training import prepare_compression from neural_compressor.quantization import fit from neural_compressor.config import ( + DistillationConfig, + WeightPruningConfig, PostTrainingQuantConfig, QuantizationAwareTrainingConfig, ) @@ -133,11 +130,7 @@ def __init__(self, *args, **kwargs): self._calib_dataloader = None self._resuming_checkpoint = None self.compression_ctrl = None - self.component = None self.enable_inc_quant = False - self.pruner = None - self.quantizer = None - self.distiller = None self.fp32_model = None self.opt_model = None # This flag is set for the engine in the export_to_int8_onnx API. @@ -147,6 +140,7 @@ def __init__(self, *args, **kwargs): self.orchestrate_opt_pruning = False self.dynamic_config = None self.model_config = None + self.compression_manager = None @property def resuming_checkpoint(self): @@ -244,7 +238,7 @@ def builtin_train_func(self, model): """ self.model_wrapped = model self.model = model - train_result = self.train(component=self.component, + train_result = self.train(compression_manager=self.compression_manager, resume_from_checkpoint=self._resuming_checkpoint) metrics = train_result.metrics if not self.orchestrate_opt: @@ -275,10 +269,11 @@ def _inc_quantize( eval_func=self._eval_func) else: compression_manager = prepare_compression(self.model, quant_config) - compression_manager.callbacks.on_train_begin() - self.train() - compression_manager.callbacks.on_train_end() - self.opt_model = compression_manager.model + self.compression_manager = compression_manager + self.compression_manager.callbacks.on_train_begin() + self._train_func(compression_manager.model._model) + self.compression_manager.callbacks.on_train_end() + self.opt_model = self.compression_manager.model self.enable_inc_quant = True self.save_model(self.args.output_dir) return self.opt_model.model @@ -338,54 +333,9 @@ def _save_inc_int8(self, opt_model, output_dir): torch.save(opt_model.quantized_state_dict(), weights_file) logger.info("quantized model and configure file have saved to {}".format(output_dir)) - def init_pruner( - self, - pruning_config=None, - provider: str = Provider.INC.value, - ): - """Initialize the pruner. - - Args: - pruning_config: The path to the YAML configuration file or PruningConf class containing - accuracy goal, pruning objective and related dataloaders etc. - provider: The provider used to quantize. 
- - Returns: - An objective of neural_compressor Pruning class. - """ - - from neural_compressor.experimental import Pruning - self.pruning_config = pruning_config - self.metrics = self.pruning_config.metrics - self._provider = Provider[provider.upper()].value - - assert isinstance(self.pruning_config, PruningConfig), \ - "please pass a instance of PruningConfig to trainer.prune!" - - pruning_start_epoch, pruning_end_epoch = self.pruning_config.epoch_range - - # pylint: disable=E1101 - if pruning_start_epoch > self.args.num_train_epochs - 1: - logger.warning(f"Pruning end epoch {pruning_start_epoch} is higher than " - f"the total number of training epoch " - f"{self.args.num_train_epochs}. No pruning will be applied.") - - # pylint: disable=E1101 - if pruning_end_epoch > self.args.num_train_epochs - 1: - logger.warning( - f"Pruning end epoch {pruning_end_epoch} is higher than " - f"the total number of training epoch " - f"{self.args.num_train_epochs}. The target sparsity will not be reached.") - - pruner = Pruning(self.pruning_config.inc_config) - pruner.model = self.model - - self.pruner = pruner - return pruner - def prune( self, - pruning_config=None, + pruning_config: Union[WeightPruningConfig] = None, provider: str = Provider.INC.value, eval_func: Optional[Callable] = None, train_func: Optional[Callable] = None, @@ -402,72 +352,19 @@ def prune( Returns: An objective of neural_compressor Pruning class. """ - if self.pruner is None: - self.init_pruner(pruning_config=pruning_config, provider=provider) - if eval_func is not None: - self._eval_func = eval_func - if train_func is not None: - self._train_func = train_func - - if self._eval_func is not None: - self.pruner.eval_func = self._eval_func - else: - assert self.metrics is not None, "Please pass metrics to trainer.pruning.metrics!" - assert self.pruning_config.pruner_config[0].prune_type == PruningMode.BASICMAGNITUDE.value, \ - "Please pass eval_func to trainer.eval_func" - self.pruner.eval_func = self.builtin_eval_func - - if self._train_func is not None: - self.pruner.pruning_func = self._train_func - else: - assert self.pruning_config.pruner_config[0].prune_type == PruningMode.BASICMAGNITUDE.value, \ - "Please pass train_func to trainer.train_func" - self.pruner.pruning_func = self.builtin_train_func - - self.component = self.pruner - self.opt_model = self.pruner.fit() - stats, sparsity = self.opt_model.report_sparsity() - logger.info(stats) - logger.info(sparsity) - + self._eval_func = self.builtin_eval_func if eval_func is None else eval_func + self._train_func = self.builtin_train_func if train_func is None else train_func + compression_manager = prepare_compression(model=self.model, confs=pruning_config) + self.compression_manager = compression_manager + self.compression_manager.callbacks.on_train_begin() + self._train_func(compression_manager.model._model) + self.compression_manager.callbacks.on_train_end() + self.opt_model = self.compression_manager.model return self.opt_model.model - def init_distiller( - self, - distillation_config, - teacher_model: Union[PreTrainedModel, torch.nn.Module], - provider: str = Provider.INC.value, - ): - """The main entry point of automatic distillation tuning. - - Args: - quant_config: The path to the YAML configuration file or DistillationConfig class containing. - accuracy goal, distillation objective and related dataloaders etc. - teacher_model: The model(torch.nn.Module) transfers knowledge to a smaller model. - provider (str): The provider used to quantize. 
- - Returns: - An objective of neural_compressor Distillation class. - """ - from neural_compressor.experimental import Distillation - assert isinstance(distillation_config, DistillationConfig), \ - "please pass a instance of PruningConfig to trainer.prune!" - self.distillation_config = distillation_config - self._provider = Provider[provider.upper()].value - self.metrics = self.distillation_config.metrics - self.teacher_model = teacher_model - - distiller = Distillation(self.distillation_config.inc_config) - distiller.model = self.model - distiller.teacher_model = self.teacher_model - - self.distiller = distiller - return distiller - def distill( self, - distillation_config, - teacher_model: Union[PreTrainedModel, torch.nn.Module], + distillation_config: Union[DistillationConfig] = None, provider: str = Provider.INC.value, eval_func: Optional[Callable] = None, train_func: Optional[Callable] = None, @@ -477,7 +374,6 @@ def distill( Args: quant_config: The path to the YAML configuration file or DistillationConfig class containing accuracy goal, distillation objective and related dataloaders etc. - teacher_model: The model(torch.nn.Module) transfers knowledge to a smaller model. provider (str): The provider used to quantize. eval_func (:obj:`Callable`, optional: The function to evaluate the model. train_func (:obj:`Callable`, optional: The function to train the model. @@ -485,34 +381,25 @@ def distill( Returns: An objective of neural_compressor Distillation class. """ - if self.distiller is None: - self.init_distiller(distillation_config=distillation_config, - teacher_model=teacher_model, - provider=provider) - if eval_func is not None: - self._eval_func = eval_func - if train_func is not None: - self._train_func = train_func - - if self._eval_func is not None: - self.distiller.eval_func = self._eval_func + if distillation_config.teacher_model is not None: + self.teacher_model = distillation_config.teacher_model else: - assert self.metrics is not None, \ - "Please pass metrics to trainer.distillation.metrics!" - self.distiller.eval_func = self.builtin_eval_func - - self.distiller.train_func = \ - self.builtin_train_func if self._train_func is None else self._train_func - self.distiller.create_criterion() - self.component = self.distiller - self.opt_model = self.distiller.fit() + assert False, "Please provide teacher model for DistillationConfig." + self._eval_func = self.builtin_eval_func if eval_func is None else eval_func + self._train_func = self.builtin_train_func if train_func is None else train_func + + compression_manager = prepare_compression(self.model, distillation_config) + self.compression_manager = compression_manager + self.compression_manager.callbacks.on_train_begin() + self._train_func(compression_manager.model._model) + self.compression_manager.callbacks.on_epoch_end() + self.opt_model = self.compression_manager.model return self.opt_model.model def orchestrate_optimizations( self, config_list, - teacher_model: Optional[Callable] = None, eval_func: Optional[Callable] = None, train_func: Optional[Callable] = None, ): @@ -525,50 +412,503 @@ def orchestrate_optimizations( eval_func (:obj:`Callable`, optional): Evaluation function to evaluate the tuning objective. train_func (:obj:`Callable`, optional): Training function which will be combined with pruning. 
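+
+        Example (a minimal sketch, mirroring the updated nightly test
+        ``tests/Nightly/test_orchestrate_optimization.py``; assumes ``trainer`` is an
+        ``NLPTrainer`` and ``teacher_model`` is an already fine-tuned model)::
+
+            pruning_conf = WeightPruningConfig([{"start_step": 0, "end_step": 2}],
+                                               target_sparsity=0.64,
+                                               pruning_scope="local")
+            distillation_criterion = KnowledgeDistillationLossConfig(loss_types=["CE", "KL"])
+            distillation_conf = DistillationConfig(teacher_model=teacher_model,
+                                                   criterion=distillation_criterion)
+            quantization_conf = QuantizationAwareTrainingConfig()
+            model = trainer.orchestrate_optimizations(
+                config_list=[pruning_conf, distillation_conf, quantization_conf])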
""" - from intel_extension_for_transformers.transformers.optimizer import Orchestrate_optimizer + # from intel_extension_for_transformers.transformers.optimizer import Orchestrate_optimizer self.orchestrate_opt = True + for config in config_list: + if isinstance(config, DistillationConfig): + self.teacher_model = config.teacher_model + assert self.teacher_model is not None, "Distillation need teacher model, please provide." self._eval_func = self.builtin_eval_func if eval_func is None else eval_func self._train_func = self.builtin_train_func if train_func is None else train_func - components = self.create_optimizer_builtin(config_list, teacher_model) - self.orchestrate_optimizer = Orchestrate_optimizer(self.model, components, \ - eval_func=self.eval_func, train_func=self.train_func, \ - output_dir=self.args.output_dir) - self.component = self.orchestrate_optimizer.scheduler.components[0] - torch_model = self.orchestrate_optimizer.fit() - return torch_model + compression_manager = prepare_compression(model=self.model, confs=config_list) + self.compression_manager = compression_manager + self.compression_manager.callbacks.on_train_begin() + self._train_func(compression_manager.model._model) + self.compression_manager.callbacks.on_train_end() + self.opt_model = self.compression_manager.model + return self.opt_model.model - def create_optimizer_builtin(self, config_list, teacher_model=None): - """The function to create optimizer. + def train( + self, + compression_manager = None, + resume_from_checkpoint: Optional[Union[str, bool]] = None, + trial: Union["optuna.Trial", Dict[str, Any]] = None, + ignore_keys_for_eval: Optional[List[str]] = None, + **kwargs, + ): # pragma: no cover + """The main entry point tor train model. Args: - config_list: The list of configs. - teacher_model (:obj:`Callable`, optional): The model(torch.nn.Module) transfers knowledge - to a smaller model. + compression_manager (:obj:`CompressionManager`, `optional`): handling the training process. + resume_from_checkpoint (:obj:`str` or :obj:`bool`, `optional`): If a :obj:`str`, local path + to a saved checkpoint as saved by a previous instance of :class:`~transformers.Trainer`. + If a :obj:`bool` and equals `True`, load the last checkpoint in `args.output_dir` as saved + by a previous instance of :class:`~transformers.Trainer`. If present, training will resume + from the model/optimizer/scheduler states loaded here. + trial (:obj:`optuna.Trial` or :obj:`Dict[str, Any]`, `optional`): The trial run or the + hyperparameter dictionary for hyperparameter search. + ignore_keys_for_eval (:obj:`List[str]`, `optional`): A list of keys in the output of your model + (if it is a dictionary) that should be ignored when gathering predictions for evaluation + during the training. 
+            kwargs: Additional keyword arguments used to hide deprecated arguments.
         """
-        components = []
-        for config in config_list:
-            if isinstance(config, PostTrainingQuantConfig):
-                component = self.init_quantizer(config)
-                component.eval_func = self._eval_func
-                component.q_func = self._train_func
-                self.enable_inc_quant = True
-            elif isinstance(config, PruningConfig):
-                self.orchestrate_opt_pruning = True
-                component = self.init_pruner(config)
-                component.eval_func = self._eval_func
-                component.pruning_func = self._train_func
-            elif isinstance(config, DistillationConfig):
-                assert isinstance(teacher_model, torch.nn.Module), \
-                    "The teacher_model is needed for distiller"
-                component = self.init_distiller(config, teacher_model)
-                component.eval_func = self._eval_func
-                component.train_func = self._train_func
-                component.create_criterion()
-            else: # pragma: no cover
-                assert False, "Orchestrate_optimizations config_list requires at least one" \
-                    " `QuantizationConfig`, `PruningConfig` or `DistillationConfig` object"
-            components.append(component)
-        return components
+        resume_from_checkpoint = None if not resume_from_checkpoint else resume_from_checkpoint
+
+        # memory metrics - must set up as early as possible
+        # pylint: disable=E1101
+        self._memory_tracker.start()
+
+        # pylint: disable=E1101
+        args = self.args
+
+        self.is_in_train = True
+
+        self.compression_manager = compression_manager
+
+        # do_train is not a reliable argument, as it might not be set and .train() still called, so
+        # the following is a workaround:
+        if args.fp16_full_eval and not args.do_train:
+            self._move_model_to_device(self.model, args.device)
+
+        if "model_path" in kwargs:
+            resume_from_checkpoint = kwargs.pop("model_path")
+            warnings.warn(
+                "`model_path` is deprecated and will be removed in a future version. Use `resume_from_checkpoint` "
+                "instead.",
+                FutureWarning,
+            )
+        if len(kwargs) > 0:
+            raise TypeError(
+                f"train() received unexpected keyword arguments: {', '.join(list(kwargs.keys()))}."
+            )
+        # This might change the seed so needs to run first.
+        self._hp_search_setup(trial)
+
+        # Model re-init
+        model_reloaded = False
+        if self.model_init is not None:
+            # Seed must be set before instantiating the model when using model_init.
+            set_seed(args.seed)
+            self.model = self.call_model_init(trial)
+            model_reloaded = True
+            # Reinitializes optimizer and scheduler
+            self.optimizer, self.lr_scheduler = None, None
+
+        # Load potential model checkpoint
+        if isinstance(resume_from_checkpoint, bool) and resume_from_checkpoint:
+            resume_from_checkpoint = get_last_checkpoint(args.output_dir)
+            if resume_from_checkpoint is None:
+                raise ValueError(
+                    f"No valid checkpoint found in output directory ({args.output_dir})")
+
+        if resume_from_checkpoint is not None:
+            if version.parse(__version__) < version.parse("4.19"):
+                if not os.path.isfile(os.path.join(resume_from_checkpoint, WEIGHTS_NAME)):
+                    raise ValueError(f"Can't find a valid checkpoint at {resume_from_checkpoint}")
+
+                logger.info(f"Loading model from {resume_from_checkpoint}.")
+
+                if os.path.isfile(os.path.join(resume_from_checkpoint, CONFIG_NAME)):
+                    config = PretrainedConfig.from_json_file(
+                        os.path.join(resume_from_checkpoint, CONFIG_NAME))
+                    checkpoint_version = config.transformers_version
+                    if checkpoint_version is not None and checkpoint_version != __version__:
+                        logger.warn(
+                            f"You are resuming training from a checkpoint trained with {checkpoint_version} of "
+                            f"Transformers but your current version is {__version__}. "
" + "This is not recommended and could yield to errors or unwanted behaviors." + ) + + # We load the model state dict on the CPU to avoid an OOM error. + state_dict = torch.load(os.path.join(resume_from_checkpoint, WEIGHTS_NAME), + map_location="cpu") + # If the model is on the GPU, it still works! + self._load_state_dict_in_model(state_dict) + + # release memory + del state_dict + else: + self._load_from_checkpoint(resume_from_checkpoint) + + # If model was re-initialized, put it on the right device and update self.model_wrapped + if model_reloaded: + if self.place_model_on_device: + self._move_model_to_device(self.model, args.device) + self.model_wrapped = self.model + + # Keeping track whether we can can len() on the dataset or not + train_dataset_is_sized = isinstance(self.train_dataset, collections.abc.Sized) + + # Data loader and number of training steps + # pylint: disable=E1101 + train_dataloader = self.get_train_dataloader() + + # Setting up training control variables: + # number of training epochs: num_train_epochs + # number of training steps per epoch: num_update_steps_per_epoch + # total number of training steps to execute: max_steps + total_train_batch_size = args.train_batch_size * args.gradient_accumulation_steps * args.world_size + if train_dataset_is_sized: + num_update_steps_per_epoch = len(train_dataloader) // args.gradient_accumulation_steps + num_update_steps_per_epoch = max(num_update_steps_per_epoch, 1) + if args.max_steps > 0: + max_steps = args.max_steps + num_train_epochs = args.max_steps // num_update_steps_per_epoch + int( + args.max_steps % num_update_steps_per_epoch > 0) + # May be slightly incorrect if the last batch in the training datalaoder has a smaller size but it's + # the best we can do. + num_train_samples = args.max_steps * total_train_batch_size + else: + max_steps = math.ceil(args.num_train_epochs * num_update_steps_per_epoch) + num_train_epochs = math.ceil(args.num_train_epochs) + num_train_samples = len(self.train_dataset) * args.num_train_epochs + else: + # see __init__. max_steps is set when the dataset has no __len__ + max_steps = args.max_steps + # Setting a very large number of epochs so we go as many times as necessary over the iterator. + num_train_epochs = sys.maxsize + num_update_steps_per_epoch = max_steps + num_train_samples = args.max_steps * total_train_batch_size + + # pylint: disable=E1101 + if DebugOption.UNDERFLOW_OVERFLOW in self.args.debug: + if self.args.n_gpu > 1: + # nn.DataParallel(model) replicates the model, creating new variables and module + # references registered here no longer work on other gpus, breaking the module + raise ValueError("Currently --debug underflow_overflow is not supported under DP. 
" + "Please use DDP (torch.distributed.launch).") + else: + debug_overflow = DebugUnderflowOverflow(self.model) # noqa + + # delay_optimizer_creation = is_sagemaker_mp_enabled() or self.is_fsdp_xla_enabled or self.is_fsdp_enabled + delay_optimizer_creation = is_sagemaker_mp_enabled() + + if not delay_optimizer_creation: + self.create_optimizer_and_scheduler(num_training_steps=max_steps) + + self.state = TrainerState() + self.state.is_hyper_param_search = trial is not None + + # Activate gradient checkpointing if needed + if args.gradient_checkpointing: + self.model.gradient_checkpointing_enable() + + model = self._wrap_model(self.model_wrapped) + + # for the rest of this function `model` is the outside model, whether it was wrapped or not + if model is not self.model: + self.model_wrapped = model + + if delay_optimizer_creation: + self.create_optimizer_and_scheduler(num_training_steps=max_steps) + + # Check if saved optimizer or scheduler states exist + self._load_optimizer_and_scheduler(resume_from_checkpoint) + + # important: at this point: + # self.model is the Transformers Model + # self.model_wrapped is DDP(Transformers Model), Deepspeed(Transformers Model), etc. + + # Train! + num_examples = (self.num_examples(train_dataloader) + if train_dataset_is_sized else total_train_batch_size * args.max_steps) + + logger.info("***** Running training *****") + logger.info(f" Num examples = {num_examples}") + logger.info(f" Num Epochs = {num_train_epochs}") + logger.info(f" Instantaneous batch size per device = {args.per_device_train_batch_size}") + logger.info( + f" Total train batch size (w. parallel, distributed & accumulation) = {total_train_batch_size}" + ) + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") + logger.info(f" Total optimization steps = {max_steps}") + + self.state.epoch = 0 + start_time = time.time() + epochs_trained = 0 + steps_trained_in_current_epoch = 0 + steps_trained_progress_bar = None + + # Check if continuing training from a checkpoint + if resume_from_checkpoint is not None and os.path.isfile( + os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME)): + self.state = TrainerState.load_from_json( + os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME)) + epochs_trained = self.state.global_step // num_update_steps_per_epoch + if not args.ignore_data_skip: + steps_trained_in_current_epoch = self.state.global_step % ( + num_update_steps_per_epoch) + steps_trained_in_current_epoch *= args.gradient_accumulation_steps + else: + steps_trained_in_current_epoch = 0 + + logger.info(" Continuing training from checkpoint, will skip to saved global_step") + logger.info(f" Continuing training from epoch {epochs_trained}") + logger.info(f" Continuing training from global step {self.state.global_step}") + if not args.ignore_data_skip: + logger.info( + f" Will skip the first {epochs_trained} epochs then the first {steps_trained_in_current_epoch} " + "batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` " + "flag to your launch command, but you will resume the training on data already seen by your model." 
+ ) + if self.is_local_process_zero() and not args.disable_tqdm: + steps_trained_progress_bar = tqdm(total=steps_trained_in_current_epoch) + steps_trained_progress_bar.set_description("Skipping the first batches") + + # Update the references + self.callback_handler.model = self.model + self.callback_handler.optimizer = self.optimizer + self.callback_handler.lr_scheduler = self.lr_scheduler + self.callback_handler.train_dataloader = train_dataloader + self.state.trial_name = self.hp_name(trial) if self.hp_name is not None else None + if trial is not None: + assignments = trial.assignments if self.hp_search_backend == HPSearchBackend.SIGOPT else trial + self.state.trial_params = hp_params(assignments) + else: + self.state.trial_params = None + # This should be the same if the state has been saved but in case the training arguments changed, it's safer + # to set this after the load. + self.state.max_steps = max_steps + self.state.num_train_epochs = num_train_epochs + self.state.is_local_process_zero = self.is_local_process_zero() + self.state.is_world_process_zero = self.is_world_process_zero() + + tr_loss = torch.tensor(0.0).to(args.device) + # _total_loss_scalar is updated every time .item() has to be called on tr_loss and stores the sum of all losses + self._total_loss_scalar = 0.0 + self._globalstep_last_logged = self.state.global_step + model.zero_grad() + + self.control = self.callback_handler.on_train_begin(args, self.state, self.control) + + # Skip the first epochs_trained epochs to get the random state of the dataloader at the right point. + if not args.ignore_data_skip: + for epoch in range(epochs_trained): + # We just need to begin an iteration to create the randomization of the sampler. + for _ in train_dataloader: + break + if self.compression_manager is not None: + if self.teacher_model is not None: + self.teacher_model = self._wrap_model( + self.teacher_model) + # compression_manager.pre_epoch_begin(self.calib_dataloader if self.calib_dataloader else None) + for epoch in range(epochs_trained, num_train_epochs): + if isinstance(train_dataloader, torch.utils.data.dataloader.DataLoader) and \ + isinstance(train_dataloader.sampler, torch.utils.data.distributed.DistributedSampler): + train_dataloader.sampler.set_epoch(epoch) + elif isinstance(train_dataloader.dataset, IterableDatasetShard): + train_dataloader.dataset.set_epoch(epoch) + + epoch_iterator = train_dataloader + + # Reset the past mems state at the beginning of each epoch if necessary. 
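+            # NOTE: ``args.past_index >= 0`` applies only to models such as
+            # Transformer-XL or XLNet that carry cached hidden states ("mems")
+            # from one step to the next; the cache must not leak across epochs.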
+ if args.past_index >= 0: + self._past = None + + steps_in_epoch = (len(epoch_iterator) if train_dataset_is_sized else args.max_steps * + args.gradient_accumulation_steps) + self.control = self.callback_handler.on_epoch_begin(args, self.state, self.control) + if self.compression_manager is not None: + self.compression_manager.callbacks.on_epoch_begin(epoch) + + self.in_training = True + for step, inputs in enumerate(epoch_iterator): + + # Skip past any already trained steps if resuming training + if steps_trained_in_current_epoch > 0: + steps_trained_in_current_epoch -= 1 + if steps_trained_progress_bar is not None: + steps_trained_progress_bar.update(1) + if steps_trained_in_current_epoch == 0: + self._load_rng_state(resume_from_checkpoint) + continue + elif steps_trained_progress_bar is not None: + steps_trained_progress_bar.close() + steps_trained_progress_bar = None + + if step % args.gradient_accumulation_steps == 0: + self.control = self.callback_handler.on_step_begin( + args, self.state, self.control) + if compression_manager is not None: + self.compression_manager.callbacks.on_step_begin(step) + + training_step = self.training_step_length_adaptive if self.dynamic_config is not None and \ + self.dynamic_config.dynamic_training else self.training_step + if ( + ((step + 1) % args.gradient_accumulation_steps != 0) + and args.local_rank != -1 + and args._no_sync_in_gradient_accumulation + ): + # Avoid unnecessary DDP synchronization since there will be no backward pass on this example. + with model.no_sync(): + tr_loss_step = training_step(model, inputs) + else: + tr_loss_step = training_step(model, inputs) + + if args.logging_nan_inf_filter and (torch.isnan(tr_loss_step) + or torch.isinf(tr_loss_step)): + # if loss is nan or inf simply add the average of previous logged losses + tr_loss += tr_loss / (1 + self.state.global_step - + self._globalstep_last_logged) + else: + tr_loss += tr_loss_step + + self.current_flos += float(self.floating_point_ops(inputs)) + + if (step + 1) % args.gradient_accumulation_steps == 0 or ( + # last step in epoch but step is always smaller than gradient_accumulation_steps + steps_in_epoch <= args.gradient_accumulation_steps and + (step + 1) == steps_in_epoch): + # if isinstance(component, Component): + # component.on_post_grad() + + # Gradient clipping + if args.max_grad_norm is not None and args.max_grad_norm > 0: + + if hasattr(self.optimizer, "clip_grad_norm"): + # Some optimizers (like the sharded optimizer) have a specific way to do gradient clipping + self.optimizer.clip_grad_norm(args.max_grad_norm) + elif hasattr(model, "clip_grad_norm_"): + # Some models (like FullyShardedDDP) have a specific way to do gradient clipping + model.clip_grad_norm_(args.max_grad_norm) + else: + # Revert to normal clipping otherwise, handling Apex or full precision + torch.nn.utils.clip_grad_norm_( + model.parameters(), + args.max_grad_norm, + ) + + # # Optimizer step + # if self.compression_ctrl is not None: + # self.compression_ctrl.scheduler.step() + if self.compression_manager is not None: + self.compression_manager.callbacks.on_before_optimizer_step() + optimizer_was_run = True + self.optimizer.step() + if self.compression_manager is not None: + self.compression_manager.callbacks.on_after_optimizer_step() + + if optimizer_was_run: + self.lr_scheduler.step() + + model.zero_grad() + self.state.global_step += 1 + self.state.epoch = epoch + (step + 1) / steps_in_epoch + self.state.curr_loss = tr_loss_step.cpu().detach().item() + self.control = 
self.callback_handler.on_step_end(args, self.state,
+                                                                  self.control)
+
+                    if self.compression_manager is not None:
+                        self.compression_manager.callbacks.on_step_end()
+                    self._maybe_log_save_evaluate(tr_loss, model, trial, epoch,
+                                                  ignore_keys_for_eval)
+                else:
+                    self.control = self.callback_handler.on_substep_end(
+                        args, self.state, self.control)
+
+                if self.control.should_epoch_stop or self.control.should_training_stop:
+                    break
+
+            self.in_training = False
+            self.control = self.callback_handler.on_epoch_end(args, self.state, self.control)
+            if self.compression_manager is not None:
+                self.compression_manager.callbacks.on_epoch_end()
+            self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval)
+
+            # pylint: disable=E1101
+            if DebugOption.TPU_METRICS_DEBUG in self.args.debug:
+                logger.warning(
+                    "You enabled PyTorch/XLA debug metrics but you don't have a TPU "
+                    "configured. Check your training configuration if this is unexpected.")
+
+            if self.control.should_training_stop:
+                break
+
+        if args.past_index and hasattr(self, "_past"):
+            # Clean the state at the end of training
+            delattr(self, "_past")
+
+        logger.info(
+            "\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n"
+        )
+        if args.load_best_model_at_end and self.state.best_model_checkpoint is not None:
+            # Wait for everyone to get here so we are sure the model has been saved by process 0.
+            if args.local_rank != -1 and args.n_gpu > 1:
+                torch.distributed.barrier()
+
+            if version.parse(__version__) < version.parse("4.19"):
+                logger.info(
+                    f"Loading best model from {self.state.best_model_checkpoint} (score: {self.state.best_metric})."
+                )
+
+                best_model_path = os.path.join(self.state.best_model_checkpoint, WEIGHTS_NAME)
+                if os.path.exists(best_model_path):
+                    # We load the model state dict on the CPU to avoid an OOM error.
+                    state_dict = torch.load(best_model_path, map_location="cpu")
+                    # If the model is on the GPU, it still works!
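+                    # NOTE: loading on CPU first keeps peak memory at a single copy
+                    # of the weights; loading the state dict then copies each tensor
+                    # onto the device its parameter already lives on.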
+ self._load_state_dict_in_model(state_dict) + else: + logger.warn(f"Could not locate the best model at {best_model_path}, " + "if you are running a distributed training on multiple nodes, " + "you should activate `--save_on_each_node`.") + else: + self._load_best_model() + + # add remaining tr_loss + self._total_loss_scalar += tr_loss.item() + train_loss = self._total_loss_scalar / self.state.global_step + + metrics = speed_metrics("train", + start_time, + num_samples=num_train_samples, + num_steps=self.state.max_steps) + self.store_flos() + metrics["total_flos"] = self.state.total_flos + metrics["train_loss"] = train_loss + + self.is_in_train = False + + self._memory_tracker.stop_and_update_metrics(metrics) + + self.log(metrics) + + self.control = self.callback_handler.on_train_end(args, self.state, self.control) + + return TrainOutput(self.state.global_step, train_loss, metrics) + + # pylint: disable=E1101 + def _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch, + ignore_keys_for_eval): # pragma: no cover + if self.control.should_log: + if is_torch_tpu_available(): + xm.mark_step() + + logs: Dict[str, float] = {} + + # all_gather + mean() to get average loss over all processes + tr_loss_scalar = self._nested_gather(tr_loss).mean().item() + + # reset tr_loss to zero + tr_loss -= tr_loss + + logs["loss"] = round( + tr_loss_scalar / (self.state.global_step - self._globalstep_last_logged), 4) + logs["learning_rate"] = self._get_learning_rate() + + self._total_loss_scalar += tr_loss_scalar + self._globalstep_last_logged = self.state.global_step + self.store_flos() + + self.log(logs) + + metrics = None + if self.control.should_evaluate: + metrics = self.evaluate(ignore_keys=ignore_keys_for_eval) + self._report_to_hp_search(trial, epoch, metrics) + + if self.control.should_save: + self._save_checkpoint(model, trial, metrics=metrics) + self.control = self.callback_handler.on_save(self.args, self.state, self.control) # pylint: disable=E1101 def training_step( @@ -656,7 +996,6 @@ def training_step( return loss.detach() - def training_step_length_adaptive( self, model: torch.nn.Module, @@ -853,11 +1192,10 @@ def compute_loss(self, model, inputs, return_outputs=False): # pragma: no cover if self.label_smoother is not None and "labels" in inputs else None teacher_logits = inputs.pop("teacher_logits") if "teacher_logits" in inputs else None - outputs = model(**inputs) - if self.in_training and hasattr(self, "component") and \ - hasattr(self.component, "criterion"): + if self.in_training and hasattr(self, "compression_manager") and \ + hasattr(self.compression_manager, "criterion"): qa_output_merger = lambda outputs: torch.vstack([ torch.vstack([sl, el]) for sl, el in zip(outputs["start_logits"], outputs["end_logits"]) @@ -884,8 +1222,8 @@ def get_logits(outputs): if "start_positions" in inputs and "end_positions" in inputs: # for SQuAD teacher_logits = torch.vstack(list(teacher_logits)) else: - teacher_outputs = self.component.criterion.teacher_model_forward(inputs) - teacher_logits = get_logits(self.component.criterion.teacher_outputs + teacher_outputs = self.compression_manager.criterion.teacher_model_forward(inputs) + teacher_logits = get_logits(self.compression_manager.criterion.teacher_outputs if teacher_outputs is None else teacher_outputs) logits = get_logits(outputs) @@ -899,14 +1237,14 @@ def get_logits(outputs): else: raise AssertionError( "Labels of input data not provided, can't compute loss") - if hasattr(self.component, "on_post_forward"): - 
self.component.on_post_forward(inputs, teacher_output=teacher_logits) - if hasattr(self.component.criterion, "teacher_outputs"): - self.component.criterion.teacher_outputs = \ - get_logits(self.component.criterion.teacher_outputs) - loss = self.component.criterion(logits, labels) - if hasattr(self.component.criterion, 'add_origin_loss') and \ - self.component.criterion.add_origin_loss: + if hasattr(self.compression_manager, "on_post_forward"): + self.compression_manager.on_post_forward(inputs, teacher_output=teacher_logits) + if hasattr(self.compression_manager.criterion, "teacher_outputs"): + self.compression_manager.criterion.teacher_outputs = \ + get_logits(self.compression_manager.criterion.teacher_outputs) + loss = self.compression_manager.criterion(logits, labels) + if hasattr(self.compression_manager.criterion, 'add_origin_loss') and \ + self.compression_manager.criterion.add_origin_loss: loss = loss + outputs['loss'] else: if self.args.past_index >= 0: @@ -917,7 +1255,8 @@ def get_logits(outputs): else: # We don't use .loss here since the model may return tuples instead of ModelOutput. loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] - loss = self.component.on_after_compute_loss(inputs, logits, loss, teacher_logits) + if self.compression_manager is not None: + loss = self.compression_manager.on_after_compute_loss(inputs, logits, loss, teacher_logits) if "start_positions" in inputs and "end_positions" in inputs: start_logits, end_logits = qa_output_spliter(logits) outputs = {"start_logits": start_logits, "end_logits": end_logits, "loss": loss} diff --git a/tests/Nightly/test_distillation.py b/tests/Nightly/test_distillation.py index 118c7bb4444..cce4410f09d 100644 --- a/tests/Nightly/test_distillation.py +++ b/tests/Nightly/test_distillation.py @@ -21,14 +21,15 @@ import unittest from datasets import load_dataset, load_metric from intel_extension_for_transformers.transformers import ( - DistillationConfig, - DistillationCriterionMode, metrics, OptimizedModel, - NoTrainerOptimizer ) + +from neural_compressor.config import ( + DistillationConfig, + KnowledgeDistillationLossConfig, +) from intel_extension_for_transformers.transformers.trainer import NLPTrainer -from intel_extension_for_transformers.transformers.distillation import Criterion from transformers import ( AutoModelForSequenceClassification, AutoTokenizer, @@ -48,7 +49,6 @@ def setUpClass(self): self.teacher_model = AutoModelForSequenceClassification.from_pretrained( 'distilbert-base-uncased-finetuned-sst-2-english' ) - self.optimizer = NoTrainerOptimizer(self.model) raw_datasets = load_dataset("glue", "sst2")["validation"] tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") def preprocess_function(examples): @@ -76,38 +76,32 @@ def compute_metrics(p): preds = np.argmax(preds, axis=1) return metric.compute(predictions=preds, references=p.label_ids) origin_weight = copy.deepcopy(self.model.classifier.weight) - for mode in DistillationCriterionMode: - print("Distillation approach:", mode.value) - self.trainer = NLPTrainer( - model=copy.deepcopy(self.model), - train_dataset=self.dataset, - eval_dataset=self.dataset, - compute_metrics=compute_metrics, - ) - metric_ = metrics.Metric(name="eval_accuracy") - criterion = Criterion( - name='IntermediateLayersLoss', - layer_mappings=[['classifier', 'classifier']], - loss_types=['MSE'], - loss_weight_ratio=[1.0], - add_origin_loss=False - ) if mode.value == "IntermediateLayersKnowledgeDistillationLoss" else None - distillation_conf = 
DistillationConfig(metrics=metric_, criterion=criterion) - distilled_model = self.trainer.distill( - distillation_config=distillation_conf, teacher_model=self.teacher_model - ) - # By default, model will be saved in tmp_trainer dir. - self.trainer.save_model('./distilled_model') - loaded_model = OptimizedModel.from_pretrained( - './distilled_model', - ) - distilled_weight = copy.deepcopy(distilled_model.classifier.weight) - loaded_weight = copy.deepcopy(loaded_model.classifier.weight) - # check distilled model - self.assertTrue((distilled_weight != origin_weight).any()) - # check loaded model - self.assertTrue((distilled_weight == loaded_weight).all()) - mlflow.end_run() + + self.trainer = NLPTrainer( + model=copy.deepcopy(self.model), + train_dataset=self.dataset, + eval_dataset=self.dataset, + compute_metrics=compute_metrics, + ) + metric_ = metrics.Metric(name="eval_accuracy") + self.trainer.metrics = metric_ + distillation_criterion_conf = KnowledgeDistillationLossConfig(loss_types=["CE", "KL"]) + distillation_conf = DistillationConfig(self.teacher_model, distillation_criterion_conf) + distilled_model = self.trainer.distill( + distillation_config=distillation_conf + ) + # By default, model will be saved in tmp_trainer dir. + self.trainer.save_model('./distilled_model') + loaded_model = OptimizedModel.from_pretrained( + './distilled_model', + ) + distilled_weight = copy.deepcopy(distilled_model.classifier.weight) + loaded_weight = copy.deepcopy(loaded_model.classifier.weight) + # check distilled model + self.assertTrue((distilled_weight != origin_weight).any()) + # check loaded model + self.assertTrue((distilled_weight == loaded_weight).all()) + mlflow.end_run() def test_functional_distil(self): def eval_func(model): @@ -118,27 +112,12 @@ def train_func(model): self.trainer = NLPTrainer(self.model) - distillation_conf = DistillationConfig() + distillation_conf = DistillationConfig(teacher_model=self.teacher_model) self.trainer.distill(distillation_conf, - teacher_model=self.teacher_model, provider="inc", train_func = train_func, eval_func = eval_func,) - def test_no_trainer_distill(self): - def eval_func(model): - return 1 - - def train_func(model): - return model - distillation_conf = DistillationConfig() - self.optimizer.eval_func = eval_func - self.optimizer.train_func = train_func - self.optimizer.distill(distillation_conf, - teacher_model=self.teacher_model, - provider="inc", - train_func = train_func, - eval_func = eval_func,) if __name__ == "__main__": unittest.main() diff --git a/tests/Nightly/test_orchestrate_optimization.py b/tests/Nightly/test_orchestrate_optimization.py index 422b10700a9..7137ccc844d 100644 --- a/tests/Nightly/test_orchestrate_optimization.py +++ b/tests/Nightly/test_orchestrate_optimization.py @@ -20,18 +20,14 @@ import torch.utils.data as data import unittest from datasets import load_dataset, load_metric -from intel_extension_for_transformers.transformers import ( - PrunerConfig, - PruningConfig, +from neural_compressor.config import ( + WeightPruningConfig, DistillationConfig, - QuantizationConfig, - DistillationCriterionMode, - metrics, - objectives, - OptimizedModel, + KnowledgeDistillationLossConfig, + QuantizationAwareTrainingConfig, ) +from intel_extension_for_transformers.transformers import metrics from intel_extension_for_transformers.transformers.trainer import NLPTrainer -from intel_extension_for_transformers.transformers.distillation import Criterion from transformers import ( AutoModelForSequenceClassification, @@ -77,36 +73,26 @@ def 
compute_metrics(p): preds = p.predictions preds = np.argmax(preds, axis=1) return metric.compute(predictions=preds, references=p.label_ids) - origin_weight = copy.deepcopy(self.model.classifier.weight) - for mode in DistillationCriterionMode: - print("Distillation approach:", mode.value) - self.trainer = NLPTrainer( - model=copy.deepcopy(self.model), - train_dataset=self.dataset, - eval_dataset=self.dataset, - compute_metrics=compute_metrics, - ) - self.trainer.calib_dataloader = self.trainer.get_eval_dataloader() + + self.trainer = NLPTrainer( + model=copy.deepcopy(self.model), + train_dataset=self.dataset, + eval_dataset=self.dataset, + compute_metrics=compute_metrics, + ) + self.trainer.calib_dataloader = self.trainer.get_eval_dataloader() tune_metric = metrics.Metric( name="eval_accuracy", is_relative=True, criterion=0.5 ) - pruner_config = PrunerConfig(prune_type='PatternLock', target_sparsity_ratio=0.9) - pruning_conf = PruningConfig(framework="pytorch_fx",pruner_config=[pruner_config], metrics=tune_metric) - distillation_conf = DistillationConfig(framework="pytorch_fx", metrics=tune_metric) - - objective = objectives.performance - quantization_conf = QuantizationConfig( - approach="QuantizationAwareTraining", - max_trials=600, - metrics=[tune_metric], - objectives=[objective] - ) - - from neural_compressor.adaptor.torch_utils.symbolic_trace import symbolic_trace - self.model = symbolic_trace(self.model, is_qat=True) - self.trainer.model = self.model + self.trainer.metrics = tune_metric + pruning_conf = WeightPruningConfig([{"start_step": 0, "end_step": 2}], + target_sparsity=0.64, + pruning_scope="local") + distillation_criterion = KnowledgeDistillationLossConfig(loss_types=["CE", "KL"]) + distillation_conf = DistillationConfig(teacher_model=self.teacher_model, criterion=distillation_criterion) + quantization_conf = QuantizationAwareTrainingConfig() conf_list = [pruning_conf, distillation_conf, quantization_conf] - opt_model = self.trainer.orchestrate_optimizations(config_list=conf_list, teacher_model=self.teacher_model) + opt_model = self.trainer.orchestrate_optimizations(config_list=conf_list) self.assertTrue("quantize" in str(type(opt_model.classifier.module))) diff --git a/tests/Nightly/test_pruning.py b/tests/Nightly/test_pruning.py index b7284ddfe6b..01c7045bf2d 100644 --- a/tests/Nightly/test_pruning.py +++ b/tests/Nightly/test_pruning.py @@ -20,11 +20,8 @@ from intel_extension_for_transformers.transformers import ( metrics, OptimizedModel, - PrunerConfig, - PruningConfig, - PruningMode, - NoTrainerOptimizer ) +from neural_compressor.config import WeightPruningConfig from intel_extension_for_transformers.transformers.trainer import NLPTrainer from transformers import ( AutoModelForSequenceClassification, @@ -63,7 +60,6 @@ def setUpClass(self): train_dataset=self.dummy_dataset, eval_dataset=self.dummy_dataset, ) - self.optimizer = NoTrainerOptimizer(self.model) @classmethod def tearDownClass(self): @@ -72,31 +68,29 @@ def tearDownClass(self): def test_fx_model_prune(self): origin_weight = copy.deepcopy(self.model.classifier.weight) - for mode in PruningMode: - # not supported yet - if mode.name != "BasicMagnitude".upper(): - continue - self.trainer = NLPTrainer( - model=self.model, - train_dataset=self.dummy_dataset, - eval_dataset=self.dummy_dataset, - ) - metric = metrics.Metric(name="eval_loss") - pruner_config = PrunerConfig(prune_type=mode.name, target_sparsity_ratio=0.9) - pruning_conf = PruningConfig(pruner_config=pruner_config, metrics=metric) - agent = 
self.trainer.init_pruner(pruning_config=pruning_conf) - pruned_model = self.trainer.prune() - # By default, model will be saved in tmp_trainer dir. - self.trainer.save_model('./pruned_model') - loaded_model = OptimizedModel.from_pretrained( - './pruned_model', - ) - pruned_weight = copy.deepcopy(pruned_model.classifier.weight) - loaded_weight = copy.deepcopy(loaded_model.classifier.weight) - # check pruned model - self.assertTrue((pruned_weight != origin_weight).any()) - # check loaded model - self.assertTrue((pruned_weight == loaded_weight).all()) + + self.trainer = NLPTrainer( + model=self.model, + train_dataset=self.dummy_dataset, + eval_dataset=self.dummy_dataset, + ) + metric = metrics.Metric(name="eval_loss") + self.trainer.metrics = metric + pruning_conf = WeightPruningConfig([{"start_step": 0, "end_step": 2}], + target_sparsity=0.64, + pruning_scope="local") + pruned_model = self.trainer.prune(pruning_config=pruning_conf) + # By default, model will be saved in tmp_trainer dir. + self.trainer.save_model('./pruned_model') + loaded_model = OptimizedModel.from_pretrained( + './pruned_model', + ) + pruned_weight = copy.deepcopy(pruned_model.classifier.weight) + loaded_weight = copy.deepcopy(loaded_model.classifier.weight) + # check pruned model + self.assertTrue((pruned_weight != origin_weight).any()) + # check loaded model + self.assertTrue((pruned_weight == loaded_weight).all()) def test_functional_prune(self): def eval_func(model): @@ -106,27 +100,14 @@ def train_func(model): return model self.trainer = NLPTrainer(self.model) - pruner_conf = PrunerConfig(prune_type='BasicMagnitude', target_sparsity_ratio=0.9) - pruning_conf = PruningConfig(pruner_config=pruner_conf) + pruning_conf = WeightPruningConfig([{"start_step": 0, "end_step": 2}], + target_sparsity=0.64, + pruning_scope="local") self.trainer.prune(pruning_conf, provider="inc", train_func = train_func, eval_func = eval_func,) - def test_no_trainer_prune(self): - def eval_func(model): - return 1 - - def train_func(model): - return model - pruner_conf = PrunerConfig(prune_type='BasicMagnitude', target_sparsity_ratio=0.9) - pruning_conf = PruningConfig(pruner_config=pruner_conf) - self.optimizer.eval_func = eval_func - self.optimizer.train_func = train_func - self.optimizer.prune(pruning_conf, - provider="inc", - train_func = train_func, - eval_func = eval_func,) if __name__ == "__main__": unittest.main() diff --git a/tests/Nightly/test_tf_distillation.py b/tests/Nightly/test_tf_distillation.py deleted file mode 100644 index d5521845439..00000000000 --- a/tests/Nightly/test_tf_distillation.py +++ /dev/null @@ -1,134 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import shutil -import numpy as np -import unittest -import tensorflow as tf -from datasets import load_dataset, load_metric -from transformers import (TFAutoModelForSequenceClassification, AutoTokenizer, - HfArgumentParser, TFTrainingArguments, set_seed, - DefaultDataCollator) -from intel_extension_for_transformers.transformers import (DistillationConfig, metrics) -from intel_extension_for_transformers.transformers.distillation import Criterion -from intel_extension_for_transformers.transformers.optimizer_tf import TFOptimization - - -class TestDistillation(unittest.TestCase): - @classmethod - def setUpClass(self): - set_seed(42) - self.model = TFAutoModelForSequenceClassification.from_pretrained( - 'hf-internal-testing/tiny-random-distilbert') - self.teacher_model = TFAutoModelForSequenceClassification.from_pretrained( - 'hf-internal-testing/tiny-random-DistilBertForSequenceClassification') - - raw_datasets = load_dataset("glue", "sst2")["validation"] - self.tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-DistilBertForSequenceClassification") - non_label_column_names = [ - name for name in raw_datasets.column_names if name != "label" - ] - - def preprocess_function(examples): - # Tokenize the texts - args = ((examples['sentence'], )) - result = self.tokenizer(*args, - padding=True, - max_length=64, - truncation=True) - return result - - raw_datasets = raw_datasets.map(preprocess_function, - batched=True, - load_from_cache_file=False) - data_collator = DefaultDataCollator(return_tensors="tf") - dataset = raw_datasets.select(range(10)) - self.dummy_dataset = dataset.to_tf_dataset( - columns=[ - col for col in dataset.column_names - if col not in set(non_label_column_names + ["label"]) - ], - shuffle=False, - batch_size=2, - collate_fn=data_collator, - drop_remainder=False, - # `label_cols` is needed for user-defined losses, such as in this example - # datasets v2.3.x need "labels", not "label" - label_cols=["labels"] - if "label" in dataset.column_names else None, - ) - parser = HfArgumentParser(TFTrainingArguments) - self.args = parser.parse_args_into_dataclasses(args=[ - "--output_dir", "./distilled_model", - "--per_device_eval_batch_size", "2" - ])[0] - optimizer = tf.keras.optimizers.Adam( - learning_rate=self.args.learning_rate, - beta_1=self.args.adam_beta1, - beta_2=self.args.adam_beta2, - epsilon=self.args.adam_epsilon, - clipnorm=self.args.max_grad_norm, - ) - loss_fn = tf.keras.losses.SparseCategoricalCrossentropy( - from_logits=True, reduction=tf.keras.losses.Reduction.SUM) - metrics = ["accuracy"] - self.model.compile(optimizer=optimizer, loss=loss_fn, metrics=metrics) - - @classmethod - def tearDownClass(self): - shutil.rmtree('./tmp', ignore_errors=True) - shutil.rmtree('./distilled_model', ignore_errors=True) - - def test_tf_model_distil(self): - metric = load_metric("glue", "sst2") - def compute_metrics(preds, label_ids): - preds = preds["logits"] - preds = np.argmax(preds, axis=1) - result = metric.compute(predictions=preds, references=label_ids) - if len(result) > 1: - result["combined_score"] = np.mean(list(result.values())).item() - return result - - self.optimizer = TFOptimization(model=self.model, - args=self.args, - train_dataset=self.dummy_dataset, - compute_metrics=compute_metrics) - metric_ = metrics.Metric(name="eval_accuracy") - # 'CrossEntropyLoss', 'SparseCategoricalCrossentropy', 'KnowledgeDistillationLoss' - criterion = Criterion(name='KnowledgeLoss', - layer_mappings=[['classifier', 'classifier']], - loss_types=['CE', 'CE'], - 
loss_weight_ratio=[0.5, 0.5], - add_origin_loss=False) - distillation_conf = DistillationConfig(metrics=metric_, - criterion=criterion) - def eval_func(model): - return 1 - distilled_model = self.optimizer.distill( - distillation_config=distillation_conf, - teacher_model=self.teacher_model, - eval_func=eval_func, - train_func=self.optimizer.build_train_func - ) - distilled_model2 = self.optimizer.distill( - distillation_config=distillation_conf, - teacher_model=self.teacher_model, - eval_func=None, - train_func=None - ) - self.assertEqual(distilled_model.signatures['serving_default'].output_shapes['Identity'], distilled_model2.signatures['serving_default'].output_shapes['Identity']) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/Nightly/test_tf_pruning.py b/tests/Nightly/test_tf_pruning.py deleted file mode 100644 index 5fa4806957a..00000000000 --- a/tests/Nightly/test_tf_pruning.py +++ /dev/null @@ -1,147 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from intel_extension_for_transformers.transformers.utils.utility_tf import get_filepath -import numpy as np -import os -import shutil -import tensorflow as tf -import unittest -from datasets import load_dataset, load_metric -from intel_extension_for_transformers.transformers import ( - metrics, - PrunerConfig, - PruningConfig, - TFOptimization -) -from transformers import ( - AutoTokenizer, - DefaultDataCollator, - HfArgumentParser, - TFAutoModelForSequenceClassification, - TFTrainingArguments, -) - -os.environ["WANDB_DISABLED"] = "true" - - -class TestTFPruning(unittest.TestCase): - @classmethod - def setUpClass(self): - self.model = TFAutoModelForSequenceClassification.from_pretrained( - 'hf-internal-testing/tiny-random-DistilBertForSequenceClassification' - ) - raw_datasets = load_dataset("glue", "sst2")["validation"] - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-DistilBertForSequenceClassification") - non_label_column_names = [name for name in raw_datasets.column_names if name != "label"] - def preprocess_function(examples): - # Tokenize the texts - args = ( - (examples["sentence"],) - ) - result = tokenizer(*args, padding=True, max_length=64, truncation=True) - - return result - raw_datasets = raw_datasets.map(preprocess_function, batched=True, load_from_cache_file=False) - data_collator = DefaultDataCollator(return_tensors="tf") - dataset = raw_datasets.select(range(10)) - self.dummy_dataset = dataset.to_tf_dataset( - columns=[col for col in dataset.column_names if col not in - set(non_label_column_names + ["label"])], - shuffle=False, - batch_size=2, - collate_fn=data_collator, - drop_remainder=False, - # `label_cols` is needed for user-defined losses, such as in this example - # datasets v2.3.x need "labels", not "label" - label_cols=["labels"] if "label" in dataset.column_names else None, - ) - parser = HfArgumentParser(TFTrainingArguments) - self.args = parser.parse_args_into_dataclasses(args=["--output_dir", "./quantized_model", - 
"--per_device_eval_batch_size", "2"])[0] - optimizer = tf.keras.optimizers.Adam( - learning_rate=self.args.learning_rate, - beta_1=self.args.adam_beta1, - beta_2=self.args.adam_beta2, - epsilon=self.args.adam_epsilon, - clipnorm=self.args.max_grad_norm, - ) - loss_fn = tf.keras.losses.SparseCategoricalCrossentropy( - from_logits=True, reduction=tf.keras.losses.Reduction.SUM - ) - metrics = ["accuracy"] - self.model.compile(optimizer=optimizer, loss=loss_fn, metrics=metrics) - - @classmethod - def tearDownClass(self): - shutil.rmtree('./tmp', ignore_errors=True) - shutil.rmtree('./quantized_model', ignore_errors=True) - - def test_tf_model_quant(self): - # check whether it is possible to set distributed environment - # only for coverage currently - from intel_extension_for_transformers.transformers.utils.utility_tf import distributed_init - distributed_init(["localhost:12345","localhost:23456"], "worker", 0) - self.assertTrue(os.environ['TF_CONFIG'] != None) - del os.environ['TF_CONFIG'] - # check whether filepath can be set correctly if using distributed environment - # only for coverage currently - from intel_extension_for_transformers.transformers.utils.utility_tf import get_filepath - self.assertTrue(type(get_filepath("dummy", "worker", 0)) == str) - self.assertTrue(type(get_filepath("dummy", "worker", 1)) == str) - self.assertTrue(get_filepath("dummy", "worker", 0) != get_filepath("dummy", "worker", 1)) - - metric = load_metric("glue", "sst2") - def compute_metrics(preds, label_ids): - preds = preds["logits"] - preds = np.argmax(preds, axis=1) - result = metric.compute(predictions=preds, references=label_ids) - if len(result) > 1: - result["combined_score"] = np.mean(list(result.values())).item() - return result - self.optimizer = TFOptimization( - model=self.model, - args=self.args, - train_dataset=self.dummy_dataset, - eval_dataset=self.dummy_dataset, - compute_metrics=compute_metrics, - ) - tune_metric = metrics.Metric( - name="accuracy", greater_is_better=True, is_relative=True, criterion=0.01, - ) - prune_type = 'BasicMagnitude' - target_sparsity_ratio = 0.1 - pruner_config = PrunerConfig(prune_type=prune_type, target_sparsity_ratio=target_sparsity_ratio) - pruning_conf = PruningConfig( - epochs=int(1), pruner_config=pruner_config, metrics=tune_metric - ) - p_model = self.optimizer.prune(pruning_config=pruning_conf) - loaded_model = tf.saved_model.load(self.args.output_dir) - p_model = self.optimizer.prune(pruning_config=pruning_conf, - train_dataset=self.dummy_dataset, - eval_dataset=self.dummy_dataset,) - - def eval_func(model): - return 1 - - def train_func(model): - return model - - self.optimizer.prune(pruning_config=pruning_conf, - train_func=train_func, - eval_func=eval_func) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/Nightly/test_tf_quantization.py b/tests/Nightly/test_tf_quantization.py deleted file mode 100644 index 3162950c68a..00000000000 --- a/tests/Nightly/test_tf_quantization.py +++ /dev/null @@ -1,132 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-import os
-import shutil
-import tensorflow as tf
-import unittest
-from datasets import load_dataset, load_metric
-from intel_extension_for_transformers.transformers import (
-    metrics,
-    objectives,
-    QuantizationConfig,
-    TFOptimization
-)
-# from intel_extension_for_transformers.transformers import metrics, objectives
-from transformers import (
-    AutoTokenizer,
-    DefaultDataCollator,
-    HfArgumentParser,
-    TFAutoModelForSequenceClassification,
-    TFTrainingArguments,
-)
-
-os.environ["WANDB_DISABLED"] = "true"
-
-
-class TestTFQuantization(unittest.TestCase):
-    @classmethod
-    def setUpClass(self):
-        self.model = TFAutoModelForSequenceClassification.from_pretrained(
-            'hf-internal-testing/tiny-random-DistilBertForSequenceClassification'
-        )
-        raw_datasets = load_dataset("glue", "sst2")["validation"]
-        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-DistilBertForSequenceClassification")
-        non_label_column_names = [name for name in raw_datasets.column_names if name != "label"]
-        def preprocess_function(examples):
-            # Tokenize the texts
-            args = (
-                (examples["sentence"],)
-            )
-            result = tokenizer(*args, padding=True, max_length=64, truncation=True)
-
-            return result
-        raw_datasets = raw_datasets.map(preprocess_function, batched=True, load_from_cache_file=False)
-        data_collator = DefaultDataCollator(return_tensors="tf")
-        dataset = raw_datasets.select(range(10))
-        self.dummy_dataset = dataset.to_tf_dataset(
-            columns=[col for col in dataset.column_names if col not in
-                     set(non_label_column_names + ["label"])],
-            shuffle=False,
-            batch_size=2,
-            collate_fn=data_collator,
-            drop_remainder=False,
-            # `label_cols` is needed for user-defined losses, such as in this example
-            # datasets v2.3.x need "labels", not "label"
-            label_cols=["labels"] if "label" in dataset.column_names else None,
-        )
-
-
-    @classmethod
-    def tearDownClass(self):
-        shutil.rmtree('./tmp', ignore_errors=True)
-        shutil.rmtree('./quantized_model', ignore_errors=True)
-
-    def test_tf_model_quant(self):
-        parser = HfArgumentParser(TFTrainingArguments)
-        args = parser.parse_args_into_dataclasses(args=["--output_dir", "./quantized_model",
-                                                        "--per_device_eval_batch_size", "2"])
-        metric = load_metric("glue", "sst2")
-        def compute_metrics(preds, label_ids):
-            preds = preds["logits"]
-            preds = np.argmax(preds, axis=1)
-            result = metric.compute(predictions=preds, references=label_ids)
-            if len(result) > 1:
-                result["combined_score"] = np.mean(list(result.values())).item()
-            return result
-        self.optimizer = TFOptimization(
-            model=self.model,
-            args=args[0],
-            compute_metrics=compute_metrics
-        )
-        tune_metric = metrics.Metric(
-            name="accuracy", greater_is_better=True, is_relative=False, criterion=0.5
-        )
-        quantization_config = QuantizationConfig(
-            framework="tensorflow",
-            approach="POSTTRAININGSTATIC",
-            metrics=[tune_metric],
-            objectives=[objectives.performance]
-        )
-        quantized_model = self.optimizer.quantize(quant_config=quantization_config,
-                                                  train_dataset=self.dummy_dataset, eval_dataset=self.dummy_dataset)
-        loaded_model = tf.saved_model.load(args[0].output_dir)
-
-        def eval_func(model):
-            return 1
-
-        def train_func(model):
-            return model
-
-        self.optimizer.quantize(quant_config=quantization_config,
-                                train_func=train_func,
-                                eval_func=eval_func)
-
-        quantization_config = QuantizationConfig(
-            framework="tensorflow",
-            approach="POSTTRAININGSTATIC",
-            metrics=[tune_metric],
-            objectives=[objectives.performance],
-            recipes={"first_conv_or_matmul_quantization": True,
-                     "last_conv_or_matmul_quantization": True,
-                     }
-        )
-        self.optimizer.quantize(quant_config=quantization_config,
-                                train_func=train_func,
-                                eval_func=eval_func)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/workflows/compression_aware_training/config/README.md b/workflows/compression_aware_training/config/README.md
index c72b4ba0af9..86e13e21260 100644
--- a/workflows/compression_aware_training/config/README.md
+++ b/workflows/compression_aware_training/config/README.md
@@ -23,7 +23,7 @@ output_dir: Path to output directory.
 overwrite_output_dir: Whether to overwrite Output cache.
 perf_tol: Performance tolerance when optimizing the model.
 quantization: Needs to be true in this case.
-quantization_approach: Quantization approach. Supported approach are PostTrainingStatic, PostTrainingDynamic and QuantizationAwareTraining.
+quantization_approach: Quantization approach. Supported approaches are static, dynamic and qat.
 is_relative: Metric tolerance mode, expected to be relative or absolute.
 int8: Load int8 model.
 ```
@@ -41,7 +41,7 @@ output_dir: Path to output directory.
 overwrite_output_dir: Whether to overwrite Output cache.
 perf_tol: Performance tolerance when optimizing the model.
 quantization: Needs to be true in this case.
-quantization_approach: Quantization approach. Supported approach are PostTrainingStatic, PostTrainingDynamic and QuantizationAwareTraining.
+quantization_approach: Quantization approach. Supported approaches are static, dynamic and qat.
 is_relative: Metric tolerance mode, expected to be relative or absolute.
 int8: Load int8 model.
 ```
diff --git a/workflows/compression_aware_training/config/config.yaml b/workflows/compression_aware_training/config/config.yaml
index 48e31757b6e..0bc18386cfe 100755
--- a/workflows/compression_aware_training/config/config.yaml
+++ b/workflows/compression_aware_training/config/config.yaml
@@ -25,6 +25,6 @@ overwrite_output_dir: true
 perf_tol: 0.03
 quantization: true
-quantization_approach: "QuantizationAwareTraining"
+quantization_approach: "qat"
 is_relative: true
 int8: false
diff --git a/workflows/compression_aware_training/config/distillation_with_qat.yaml b/workflows/compression_aware_training/config/distillation_with_qat.yaml
index 48e31757b6e..0bc18386cfe 100755
--- a/workflows/compression_aware_training/config/distillation_with_qat.yaml
+++ b/workflows/compression_aware_training/config/distillation_with_qat.yaml
@@ -25,6 +25,6 @@ overwrite_output_dir: true
 perf_tol: 0.03
 quantization: true
-quantization_approach: "QuantizationAwareTraining"
+quantization_approach: "qat"
 is_relative: true
 int8: false
diff --git a/workflows/compression_aware_training/config/qat.yaml b/workflows/compression_aware_training/config/qat.yaml
index faf0416ed2f..be783e839bf 100644
--- a/workflows/compression_aware_training/config/qat.yaml
+++ b/workflows/compression_aware_training/config/qat.yaml
@@ -24,6 +24,6 @@ overwrite_output_dir: true
 perf_tol: 0.03
 quantization: true
-quantization_approach: "QuantizationAwareTraining"
+quantization_approach: "qat"
 is_relative: true
 int8: false
diff --git a/workflows/compression_aware_training/config/sat.yaml b/workflows/compression_aware_training/config/sat.yaml
index 7731f0dfb69..439828b0f1f 100755
--- a/workflows/compression_aware_training/config/sat.yaml
+++ b/workflows/compression_aware_training/config/sat.yaml
@@ -16,7 +16,7 @@ model_name_or_path: "Intel/distilbert-base-uncased-sparse-90-unstructured-pruneofa"
"Intel/distilbert-base-uncased-sparse-90-unstructured-pruneo teacher_model_name_or_path: "distilbert-base-uncased-finetuned-sst-2-english" task_name: "sst2" sat: true -quantization_approach: "QuantizationAwareTraining" +quantization_approach: "qat" learning_rate: 0.000012 num_train_epochs: 6 do_train: true diff --git a/workflows/compression_aware_training/src/itrex_opt.py b/workflows/compression_aware_training/src/itrex_opt.py index b727d22c412..c1ba546cdc4 100755 --- a/workflows/compression_aware_training/src/itrex_opt.py +++ b/workflows/compression_aware_training/src/itrex_opt.py @@ -28,14 +28,19 @@ # Need to use itrex domain toolkit from intel_extension_for_transformers.transformers import ( - DistillationConfig, - PrunerConfig, - PruningConfig, OptimizedModel, - QuantizationConfig, metrics, objectives, ) +from neural_compressor.config import ( + WeightPruningConfig, + DistillationConfig, + KnowledgeDistillationLossConfig, + QuantizationAwareTrainingConfig, + PostTrainingQuantConfig, + TuningCriterion, + AccuracyCriterion +) from intel_extension_for_transformers.transformers.trainer import NLPTrainer from torch.utils.data import DataLoader from tqdm.auto import tqdm @@ -529,7 +534,7 @@ def compute_metrics(p: EvalPrediction): # Initialize and setup our itrexTrainer from neural_compressor.adaptor.torch_utils.symbolic_trace import symbolic_trace - self.model = symbolic_trace(self.model, self.optim_args.quantization_approach=="QuantizationAwareTraining") + self.model = symbolic_trace(self.model, self.optim_args.quantization_approach=="qat") self.trainer = NLPTrainer( model=self.model, @@ -746,30 +751,38 @@ def _do_quantization_aware_training(self): raise ValueError("do_eval must be set to True for quantization.") self.trainer.save_model(self.training_args.output_dir) - if self.optim_args.quantization_approach != "PostTrainingDynamic": + if self.optim_args.quantization_approach != "dynamic": if not self.training_args.do_train: raise ValueError( "do_train must be set to True for static and aware training quantization." ) - elif self.optim_args.quantization_approach == "QuantizationAwareTraining": - early_stopping_patience = 6 - early_stopping_threshold = 0.001 # optional - # trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience, - # early_stopping_threshold)) tune_metric = metrics.Metric( - name=metric_name, - is_relative=self.optim_args.is_relative, - criterion=self.optim_args.perf_tol, + name=metric_name, is_relative=self.optim_args.is_relative, criterion=self.optim_args.perf_tol ) + self.trainer.metrics = tune_metric objective = objectives.performance - quantization_config = QuantizationConfig( - approach=self.optim_args.quantization_approach, - max_trials=600, - metrics=[tune_metric], - objectives=[objective], - sampling_size=len(self.train_dataset) // 20, - ) + tuning_criterion = TuningCriterion(max_trials=600, objective=[objective.name]) + accuracy_criterion = AccuracyCriterion( + higher_is_better=True, # optional. + criterion="relative" if self.optim_args.is_relative else "absolute", # optional. Available values are "relative" and "absolute". + tolerable_loss=self.optim_args.perf_tol, # optional. 
+        )
+        if self.optim_args.quantization_approach != "qat":
+            quantization_config = PostTrainingQuantConfig(
+                approach=self.optim_args.quantization_approach,
+                tuning_criterion=tuning_criterion,
+                accuracy_criterion=accuracy_criterion
+            )
+        else:
+            quantization_config = QuantizationAwareTrainingConfig(
+                tuning_criterion=tuning_criterion,
+                accuracy_criterion=accuracy_criterion
+            )
+        early_stopping_patience = 2
+        early_stopping_threshold = 0.001  # optional
+        self.trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience, \
+            early_stopping_threshold))
 
         model = self.trainer.quantize(quant_config=quantization_config)
 
         if self.optim_args.benchmark or self.optim_args.accuracy_only:
@@ -939,23 +952,15 @@ def get_logits(teacher_model, train_dataset, teacher_train_dataset):
         tune_metric = metrics.Metric(
             name=metric_name, is_relative=self.optim_args.is_relative, criterion=self.optim_args.perf_tol
         )
-        prune_type = 'PatternLock' \
-            if self.optim_args.pruning_approach else self.optim_args.pruning_approach
-        target_sparsity_ratio = self.optim_args.target_sparsity_ratio \
-            if self.optim_args.target_sparsity_ratio else None
-        pruner_config = PrunerConfig(prune_type=prune_type, target_sparsity_ratio=target_sparsity_ratio)
-        pruning_conf = PruningConfig(framework="pytorch_fx",pruner_config=[pruner_config], metrics=tune_metric)
-        distillation_conf = DistillationConfig(framework="pytorch_fx", metrics=tune_metric)
-
-        objective = objectives.performance
-        quantization_conf = QuantizationConfig(
-            approach=self.optim_args.quantization_approach,
-            max_trials=600,
-            metrics=[tune_metric],
-            objectives=[objective]
-        )
+        self.trainer.metrics = tune_metric
+        pruning_conf = WeightPruningConfig([{"start_step": 0, "end_step": 2}],
+                                           target_sparsity=self.optim_args.target_sparsity_ratio,
+                                           pruning_scope="local")
+        distillation_criterion = KnowledgeDistillationLossConfig(loss_types=["CE", "KL"])
+        distillation_conf = DistillationConfig(teacher_model=self.teacher_model, criterion=distillation_criterion)
+        quantization_conf = QuantizationAwareTrainingConfig()
         conf_list = [pruning_conf, distillation_conf, quantization_conf]
-        model = self.trainer.orchestrate_optimizations(config_list=conf_list, teacher_model=self.teacher_model)
+        model = self.trainer.orchestrate_optimizations(config_list=conf_list)
 
         # ############################################################
         print(
diff --git a/workflows/compression_aware_training/src/utils.py b/workflows/compression_aware_training/src/utils.py
index 46467c2b6ab..2ce1a4c819e 100755
--- a/workflows/compression_aware_training/src/utils.py
+++ b/workflows/compression_aware_training/src/utils.py
@@ -187,7 +187,7 @@ class OptimizationArguments:
         metadata={"help": "Whether or not to apply prune."},
     )
     pruning_approach: Optional[str] = field(
-        default="BasicMagnitude",
+        default="magnitude",
         metadata={"help": "Pruning approach. Supported approach is magnitude."},
     )
     target_sparsity_ratio: Optional[float] = field(
@@ -207,9 +207,9 @@ class OptimizationArguments:
         metadata={"help": "Whether or not to apply quantization."},
     )
     quantization_approach: Optional[str] = field(
-        default="PostTrainingStatic",
-        metadata={"help": "Quantization approach. Supported approach are PostTrainingStatic, "
-                          "PostTrainingDynamic and QuantizationAwareTraining."},
+        default="static",
+        metadata={"help": "Quantization approach. Supported approaches are static, "
+                          "dynamic and qat."},
     )
     metric_name: Optional[str] = field(
         default=None,
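
Taken together, the hunks above converge on one pattern for orchestrated pruning, distillation, and quantization-aware training with the `neural_compressor.config` classes. Below is a minimal sketch of that pattern, assuming a prepared ITREX `NLPTrainer` bound to `trainer` and a fine-tuned `teacher_model` as in `itrex_opt.py`; the metric name and sparsity target are illustrative placeholders, not values from this patch.

```python
from intel_extension_for_transformers.transformers import metrics
from neural_compressor.config import (
    DistillationConfig,
    KnowledgeDistillationLossConfig,
    QuantizationAwareTrainingConfig,
    WeightPruningConfig,
)

# Assumed context: `trainer` is an ITREX NLPTrainer wrapping the student
# model and `teacher_model` is the fine-tuned teacher, prepared as in the
# workflow sources above. "eval_accuracy" and 0.9 are placeholders.
tune_metric = metrics.Metric(name="eval_accuracy", is_relative=True, criterion=0.03)
trainer.metrics = tune_metric  # the tuning metric now attaches to the trainer

# One pruner active between steps 0 and 2, pruning each layer locally.
pruning_conf = WeightPruningConfig(
    [{"start_step": 0, "end_step": 2}],
    target_sparsity=0.9,  # placeholder; the workflow reads this from its args
    pruning_scope="local",
)

# The teacher and the distillation loss now live on DistillationConfig
# instead of being passed to the trainer call separately.
distillation_criterion = KnowledgeDistillationLossConfig(loss_types=["CE", "KL"])
distillation_conf = DistillationConfig(teacher_model=teacher_model,
                                       criterion=distillation_criterion)

# QAT config replaces QuantizationConfig(approach="QuantizationAwareTraining").
quantization_conf = QuantizationAwareTrainingConfig()

# The trainer sequences all three compressions in a single training loop.
conf_list = [pruning_conf, distillation_conf, quantization_conf]
model = trainer.orchestrate_optimizations(config_list=conf_list)
```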