diff --git a/.github/workflows/script/formatScan/pylint.sh b/.github/workflows/script/formatScan/pylint.sh index eeb71beb604..41e22c470b9 100644 --- a/.github/workflows/script/formatScan/pylint.sh +++ b/.github/workflows/script/formatScan/pylint.sh @@ -28,7 +28,7 @@ else echo "Not found requirements.txt file." fi # install packages -pip install lm-eval +pip install lm-eval==0.4.2 pip install accelerate nlpaug nltk schema optimum-intel optimum peft pip install --upgrade --force-reinstall transformers==4.36.2 pip install optimum-habana diff --git a/.github/workflows/script/install_binary.sh b/.github/workflows/script/install_binary.sh index bbd6b7df2f1..7bca0d4d2f3 100644 --- a/.github/workflows/script/install_binary.sh +++ b/.github/workflows/script/install_binary.sh @@ -4,6 +4,7 @@ source /intel-extension-for-transformers/.github/workflows/script/change_color.s cd /intel-extension-for-transformers export CMAKE_ARGS="-DNE_DNNL_CACHE_DIR=/cache" pip install -U pip +pip install -r requirements.txt $BOLD_YELLOW && echo "---------------- git submodule update --init --recursive -------------" && $RESET git config --global --add safe.directory "*" git submodule update --init --recursive diff --git a/docs/api_doc/optimization/config.rst b/docs/api_doc/optimization/config.rst index 435f5d6ddad..2ca607c03dc 100644 --- a/docs/api_doc/optimization/config.rst +++ b/docs/api_doc/optimization/config.rst @@ -4,7 +4,4 @@ Config .. 
autoapisummary:: intel_extension_for_transformers.transformers.utils.metrics intel_extension_for_transformers.transformers.utils.objectives - intel_extension_for_transformers.transformers.config - intel_extension_for_transformers.transformers.quantization - intel_extension_for_transformers.transformers.distillation - intel_extension_for_transformers.transformers.pruning + intel_extension_for_transformers.transformers.utils.config diff --git a/docs/api_doc/optimization/tf_optimization.rst b/docs/api_doc/optimization/tf_optimization.rst deleted file mode 100644 index 3aa7cb7864a..00000000000 --- a/docs/api_doc/optimization/tf_optimization.rst +++ /dev/null @@ -1,6 +0,0 @@ -TensorFlow Optimizer -============== - -.. autoapisummary:: - - intel_extension_for_transformers.transformers.optimizer_tf diff --git a/docs/api_doc/user_api.rst b/docs/api_doc/user_api.rst index 712132f5d55..80a7ead6078 100644 --- a/docs/api_doc/user_api.rst +++ b/docs/api_doc/user_api.rst @@ -7,7 +7,5 @@ The following Python API information is available: :maxdepth: 1 optimization/trainer.rst - optimization/optimizer.rst optimization/model.rst - optimization/tf_optimization.rst optimization/config.rst diff --git a/examples/.config/pytorch_optimize.json b/examples/.config/pytorch_optimize.json index 1c1573ceb41..791dc367033 100644 --- a/examples/.config/pytorch_optimize.json +++ b/examples/.config/pytorch_optimize.json @@ -2450,6 +2450,58 @@ } } }, + "phi_2b_gen_ipex_static": { + "working_dir": "huggingface/pytorch/text-generation/quantization", + "tune": { + "cmd": "bash run_tuning.sh", + "params": { + "topology": "phi_2b", + "task": "generation", + "approach": "static", + "output_model": "saved_results" + } + }, + "benchmark": { + "cmd": "bash run_benchmark.sh", + "params": { + "topology": "phi_2b", + "task": "generation", + "approach": "static", + "backend": "ipex", + "mode": "benchmark", + "batch_size": "112", + "iters": "100", + "int8": "false", + "config": "saved_results" + } + } + }, + 
"phi_3b_gen_ipex_static": { + "working_dir": "huggingface/pytorch/text-generation/quantization", + "tune": { + "cmd": "bash run_tuning.sh", + "params": { + "topology": "phi_3b", + "task": "generation", + "approach": "static", + "output_model": "saved_results" + } + }, + "benchmark": { + "cmd": "bash run_benchmark.sh", + "params": { + "topology": "phi_3b", + "task": "generation", + "approach": "static", + "backend": "ipex", + "mode": "benchmark", + "batch_size": "112", + "iters": "100", + "int8": "false", + "config": "saved_results" + } + } + }, "flan-t5-large_gen_ipex_static": { "working_dir": "huggingface/pytorch/text2text-generation", "tune": { diff --git a/examples/.config/tensorflow_optimize.json b/examples/.config/tensorflow_optimize.json deleted file mode 100644 index ab0aacaf6ce..00000000000 --- a/examples/.config/tensorflow_optimize.json +++ /dev/null @@ -1,255 +0,0 @@ -{ - "bert_base_mrpc_static": { - "working_dir": "huggingface/tensorflow/text-classification/quantization/ptq", - "tune":{ - "cmd": "bash run_tuning.sh", - "params": { - "topology": "bert_base_mrpc_static", - "output_model": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/text-classification" - } - }, - "benchmark": { - "cmd": "bash run_benchmark.sh", - "params": { - "topology": "bert_base_mrpc_static", - "mode": "accuracy", - "batch_size": "64", - "iters": "100", - "int8": "false", - "config": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/text-classification" - } - } - }, - "distilgpt2_clm": { - "working_dir": "huggingface/tensorflow/language-modeling/quantization/ptq", - "tune":{ - "cmd": "bash run_tuning.sh", - "params": { - "topology": "distilgpt2_clm", - "output_model": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/language-modeling" - } - }, - "benchmark": { - "cmd": "bash run_benchmark.sh", - "params": { - "topology": "distilgpt2_clm", - "mode": "accuracy", - "batch_size": "16", - "iters": "100", - "int8": "false", - "config": "saved_results", - 
"cache_dir": "${HOME}/.cache/nlp_toolkit/language-modeling" - } - } - }, - "distilbert_mlm": { - "working_dir": "huggingface/tensorflow/language-modeling/quantization/ptq", - "tune":{ - "cmd": "bash run_tuning.sh", - "params": { - "topology": "distilbert_mlm", - "output_model": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/language-modeling" - } - }, - "benchmark": { - "cmd": "bash run_benchmark.sh", - "params": { - "topology": "distilbert_mlm", - "mode": "accuracy", - "batch_size": "16", - "iters": "100", - "int8": "false", - "config": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/language-modeling" - } - } - }, - "bert_base_ner": { - "working_dir": "huggingface/tensorflow/token-classification/quantization", - "tune":{ - "cmd": "bash run_tuning.sh", - "params": { - "topology": "bert_base_ner", - "output_model": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/token-classification" - } - }, - "benchmark": { - "cmd": "bash run_benchmark.sh", - "params": { - "topology": "bert_base_ner", - "mode": "accuracy", - "batch_size": "16", - "iters": "100", - "int8": "false", - "config": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/token-classification" - } - } - }, - "distilbert_qa": { - "working_dir": "huggingface/tensorflow/question-answering/quantization", - "tune":{ - "cmd": "bash run_tuning.sh", - "params": { - "topology": "distilbert_qa", - "output_model": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/question-answering" - } - }, - "benchmark": { - "cmd": "bash run_benchmark.sh", - "params": { - "topology": "distilbert_qa", - "mode": "accuracy", - "batch_size": "16", - "iters": "100", - "int8": "false", - "config": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/question-answering" - } - } - }, - "distilbert_swag": { - "working_dir": "huggingface/tensorflow/multiple-choice/quantization", - "tune":{ - "cmd": "bash run_tuning.sh", - "params": { - "topology": "distilbert_swag", - 
"output_model": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/multiple-choice" - } - }, - "benchmark": { - "cmd": "bash run_benchmark.sh", - "params": { - "topology": "distilbert_swag", - "mode": "accuracy", - "batch_size": "16", - "iters": "100", - "int8": "false", - "config": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/multiple-choice" - } - } - }, - "roberta_qa": { - "working_dir": "huggingface/tensorflow/question-answering/quantization", - "tune":{ - "cmd": "bash run_tuning.sh", - "params": { - "topology": "roberta_qa", - "output_model": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/question-answering" - } - }, - "benchmark": { - "cmd": "bash run_benchmark.sh", - "params": { - "topology": "roberta_qa", - "mode": "accuracy", - "batch_size": "16", - "iters": "100", - "int8": "false", - "config": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/question-answering" - } - } - }, - "distilroberta_mlm": { - "working_dir": "huggingface/tensorflow/language-modeling/quantization/ptq", - "tune":{ - "cmd": "bash run_tuning.sh", - "params": { - "topology": "distilroberta_mlm", - "output_model": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/language-modeling" - } - }, - "benchmark": { - "cmd": "bash run_benchmark.sh", - "params": { - "topology": "distilroberta_mlm", - "mode": "accuracy", - "batch_size": "16", - "iters": "100", - "int8": "false", - "config": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/language-modeling" - } - } - }, - "legalbert_mrpc": { - "working_dir": "huggingface/tensorflow/text-classification/quantization/ptq", - "tune":{ - "cmd": "bash run_tuning.sh", - "params": { - "topology": "legalbert_mrpc", - "output_model": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/text-classification" - } - }, - "benchmark": { - "cmd": "bash run_benchmark.sh", - "params": { - "topology": "legalbert_mrpc", - "mode": "accuracy", - "batch_size": "16", - "iters": "100", - "int8": 
"false", - "config": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/text-classification" - } - } - }, - "xlnet_mrpc": { - "working_dir": "huggingface/tensorflow/text-classification/quantization/ptq", - "tune":{ - "cmd": "bash run_tuning.sh", - "params": { - "topology": "xlnet_mrpc", - "output_model": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/text-classification" - } - }, - "benchmark": { - "cmd": "bash run_benchmark.sh", - "params": { - "topology": "xlnet_mrpc", - "mode": "accuracy", - "batch_size": "16", - "iters": "100", - "int8": "false", - "config": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/text-classification" - } - } - }, - "albert_large_mrpc": { - "working_dir": "huggingface/tensorflow/text-classification/quantization/ptq", - "tune":{ - "cmd": "bash run_tuning.sh", - "params": { - "topology": "albert_large_mrpc", - "output_model": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/text-classification" - } - }, - "benchmark": { - "cmd": "bash run_benchmark.sh", - "params": { - "topology": "albert_large_mrpc", - "mode": "accuracy", - "batch_size": "16", - "iters": "100", - "int8": "false", - "config": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/text-classification" - } - } - } -} diff --git a/examples/huggingface/neural_speed/requirements.txt b/examples/huggingface/neural_speed/requirements.txt index 62d596fa100..3f7fca6d65d 100644 --- a/examples/huggingface/neural_speed/requirements.txt +++ b/examples/huggingface/neural_speed/requirements.txt @@ -1,12 +1,11 @@ intel_extension_for_transformers neural-speed -lm-eval +lm-eval==0.4.2 sentencepiece gguf --extra-index-url https://download.pytorch.org/whl/cpu torch==2.3.0+cpu transformers -intel_extension_for_pytorch==2.3.0 tiktoken transformers_stream_generator -zipfile38 \ No newline at end of file +zipfile38 diff --git a/examples/huggingface/pytorch/language-modeling/inference/requirements.txt 
b/examples/huggingface/pytorch/language-modeling/inference/requirements.txt index e87bc861ca8..cd6cd604899 100644 --- a/examples/huggingface/pytorch/language-modeling/inference/requirements.txt +++ b/examples/huggingface/pytorch/language-modeling/inference/requirements.txt @@ -1,4 +1,4 @@ transformers accelerate sentencepiece != 0.1.92 -lm-eval +lm-eval==0.4.2 diff --git a/examples/huggingface/pytorch/language-modeling/pruning/requirements.txt b/examples/huggingface/pytorch/language-modeling/pruning/requirements.txt index b60bac56d76..a1ea63132a8 100644 --- a/examples/huggingface/pytorch/language-modeling/pruning/requirements.txt +++ b/examples/huggingface/pytorch/language-modeling/pruning/requirements.txt @@ -7,5 +7,5 @@ transformers torch==2.0.1 tqdm neural_compressor -lm-eval +lm-eval==0.4.2 diff --git a/examples/huggingface/pytorch/language-modeling/quantization/requirements.txt b/examples/huggingface/pytorch/language-modeling/quantization/requirements.txt index c7b5b6fcf83..36ee5a1b55a 100644 --- a/examples/huggingface/pytorch/language-modeling/quantization/requirements.txt +++ b/examples/huggingface/pytorch/language-modeling/quantization/requirements.txt @@ -9,5 +9,5 @@ wandb einops neural-compressor pytest==8.0.0 -lm-eval +lm-eval==0.4.2 git+https://github.com/huggingface/peft.git@6c44096c7b8d55a2ecf24be9bc68393467e1584a diff --git a/examples/huggingface/pytorch/text-classification/deployment/emotion/distilbert_base_uncased/requirements.txt b/examples/huggingface/pytorch/text-classification/deployment/emotion/distilbert_base_uncased/requirements.txt index 6cf73c3deae..5d46e354048 100644 --- a/examples/huggingface/pytorch/text-classification/deployment/emotion/distilbert_base_uncased/requirements.txt +++ b/examples/huggingface/pytorch/text-classification/deployment/emotion/distilbert_base_uncased/requirements.txt @@ -1,4 +1,5 @@ -neural-compressor +intel-extension-for-transformers==1.4.2 +neural-compressor==2.6 transformers accelerate datasets >= 1.8.0 diff 
--git a/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base/requirements.txt b/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base/requirements.txt index 94b03297a4a..3e3e2e9d604 100644 --- a/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base/requirements.txt +++ b/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base/requirements.txt @@ -1,4 +1,5 @@ -neural-compressor +intel-extension-for-transformers==1.4.2 +neural-compressor==2.6 transformers accelerate datasets >= 1.8.0 diff --git a/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base_cased/requirements.txt b/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base_cased/requirements.txt index 94b03297a4a..3e3e2e9d604 100644 --- a/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base_cased/requirements.txt +++ b/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base_cased/requirements.txt @@ -1,4 +1,5 @@ -neural-compressor +intel-extension-for-transformers==1.4.2 +neural-compressor==2.6 transformers accelerate datasets >= 1.8.0 diff --git a/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base_cased/run_glue.py b/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base_cased/run_glue.py index 9374620302a..f3645880317 100644 --- a/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base_cased/run_glue.py +++ b/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base_cased/run_glue.py @@ -468,7 +468,7 @@ def preprocess_function(examples): # Get the metric function if data_args.task_name is not None: - metric = load_metric("glue", data_args.task_name) + metric = load_metric("glue", data_args.task_name,trust_remote_code=True) else: metric = load_metric("accuracy") diff --git a/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_mini/requirements.txt 
b/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_mini/requirements.txt index 94b03297a4a..3e3e2e9d604 100644 --- a/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_mini/requirements.txt +++ b/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_mini/requirements.txt @@ -1,4 +1,5 @@ -neural-compressor +intel-extension-for-transformers==1.4.2 +neural-compressor==2.6 transformers accelerate datasets >= 1.8.0 diff --git a/examples/huggingface/pytorch/text-classification/deployment/mrpc/distilbert_base_uncased/requirements.txt b/examples/huggingface/pytorch/text-classification/deployment/mrpc/distilbert_base_uncased/requirements.txt index 94b03297a4a..3e3e2e9d604 100644 --- a/examples/huggingface/pytorch/text-classification/deployment/mrpc/distilbert_base_uncased/requirements.txt +++ b/examples/huggingface/pytorch/text-classification/deployment/mrpc/distilbert_base_uncased/requirements.txt @@ -1,4 +1,5 @@ -neural-compressor +intel-extension-for-transformers==1.4.2 +neural-compressor==2.6 transformers accelerate datasets >= 1.8.0 diff --git a/examples/huggingface/pytorch/text-classification/deployment/mrpc/roberta_base/requirements.txt b/examples/huggingface/pytorch/text-classification/deployment/mrpc/roberta_base/requirements.txt index 94b03297a4a..3e3e2e9d604 100644 --- a/examples/huggingface/pytorch/text-classification/deployment/mrpc/roberta_base/requirements.txt +++ b/examples/huggingface/pytorch/text-classification/deployment/mrpc/roberta_base/requirements.txt @@ -1,4 +1,5 @@ -neural-compressor +intel-extension-for-transformers==1.4.2 +neural-compressor==2.6 transformers accelerate datasets >= 1.8.0 diff --git a/examples/huggingface/pytorch/text-classification/deployment/sparse/bert_mini/requirements.txt b/examples/huggingface/pytorch/text-classification/deployment/sparse/bert_mini/requirements.txt index 94b03297a4a..3e3e2e9d604 100644 --- 
a/examples/huggingface/pytorch/text-classification/deployment/sparse/bert_mini/requirements.txt +++ b/examples/huggingface/pytorch/text-classification/deployment/sparse/bert_mini/requirements.txt @@ -1,4 +1,5 @@ -neural-compressor +intel-extension-for-transformers==1.4.2 +neural-compressor==2.6 transformers accelerate datasets >= 1.8.0 diff --git a/examples/huggingface/pytorch/text-classification/deployment/sparse/distilbert_base_uncased/requirements.txt b/examples/huggingface/pytorch/text-classification/deployment/sparse/distilbert_base_uncased/requirements.txt index 94b03297a4a..3e3e2e9d604 100644 --- a/examples/huggingface/pytorch/text-classification/deployment/sparse/distilbert_base_uncased/requirements.txt +++ b/examples/huggingface/pytorch/text-classification/deployment/sparse/distilbert_base_uncased/requirements.txt @@ -1,4 +1,5 @@ -neural-compressor +intel-extension-for-transformers==1.4.2 +neural-compressor==2.6 transformers accelerate datasets >= 1.8.0 diff --git a/examples/huggingface/pytorch/text-classification/deployment/sst2/bert_mini/requirements.txt b/examples/huggingface/pytorch/text-classification/deployment/sst2/bert_mini/requirements.txt index 94b03297a4a..3e3e2e9d604 100644 --- a/examples/huggingface/pytorch/text-classification/deployment/sst2/bert_mini/requirements.txt +++ b/examples/huggingface/pytorch/text-classification/deployment/sst2/bert_mini/requirements.txt @@ -1,4 +1,5 @@ -neural-compressor +intel-extension-for-transformers==1.4.2 +neural-compressor==2.6 transformers accelerate datasets >= 1.8.0 diff --git a/examples/huggingface/pytorch/text-classification/deployment/sst2/distilbert_base_uncased/requirements.txt b/examples/huggingface/pytorch/text-classification/deployment/sst2/distilbert_base_uncased/requirements.txt index 94b03297a4a..3e3e2e9d604 100644 --- a/examples/huggingface/pytorch/text-classification/deployment/sst2/distilbert_base_uncased/requirements.txt +++ 
b/examples/huggingface/pytorch/text-classification/deployment/sst2/distilbert_base_uncased/requirements.txt @@ -1,4 +1,5 @@ -neural-compressor +intel-extension-for-transformers==1.4.2 +neural-compressor==2.6 transformers accelerate datasets >= 1.8.0 diff --git a/examples/huggingface/pytorch/text-classification/deployment/sst2/minilm_l6_h384_uncased/requirements.txt b/examples/huggingface/pytorch/text-classification/deployment/sst2/minilm_l6_h384_uncased/requirements.txt index 94b03297a4a..3e3e2e9d604 100644 --- a/examples/huggingface/pytorch/text-classification/deployment/sst2/minilm_l6_h384_uncased/requirements.txt +++ b/examples/huggingface/pytorch/text-classification/deployment/sst2/minilm_l6_h384_uncased/requirements.txt @@ -1,4 +1,5 @@ -neural-compressor +intel-extension-for-transformers==1.4.2 +neural-compressor==2.6 transformers accelerate datasets >= 1.8.0 diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh b/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh index bf77c9ece9a..1ed8c54b1ce 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh +++ b/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh @@ -168,13 +168,18 @@ function run_benchmark { model_name_or_path="Intel/neural-chat-7b-v3" script="run_generation_sq.py" elif [ "${topology}" = "phi_1b" ]; then - model_name_or_path="susnato/phi-1_dev" - pip install transformers==4.36.1 + model_name_or_path="microsoft/phi-1" script="run_generation_sq.py" elif [ "${topology}" = "phi_1_5b" ]; then - model_name_or_path="susnato/phi-1_5_dev" - pip install transformers==4.36.1 + model_name_or_path="microsoft/phi-1_5" script="run_generation_sq.py" + elif [ "${topology}" = "phi_2b" ]; then + model_name_or_path="microsoft/phi-2" + script="run_generation_sq.py" + elif [ "${topology}" = "phi_3b" ]; then + model_name_or_path="microsoft/Phi-3-mini-4k-instruct" + script="run_generation_sq.py" + 
extra_cmd=$extra_cmd" --trust_remote_code" elif [ "${topology}" = "llama2_7b_gptq" ] && [ "$model_source" != "huggingface" ]; then model_name_or_path="/tf_dataset2/models/nlp_toolkit/llama-2-7b-chat/Llama-2-7b-chat-hf" script="run_generation_cpu_woq.py" diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh b/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh index db3a06767c9..c89f84e8f59 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh +++ b/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh @@ -216,14 +216,25 @@ function run_tuning { script="run_generation_sq.py" elif [ "${topology}" = "phi_1b" ]; then alpha=0.5 - model_name_or_path="susnato/phi-1_dev" + model_name_or_path="microsoft/phi-1" extra_cmd=$extra_cmd" --sq --alpha ${alpha}" extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}" - extra_cmd=$extra_cmd" --trust_remote_code" script="run_generation_sq.py" elif [ "${topology}" = "phi_1_5b" ]; then alpha=0.5 - model_name_or_path="susnato/phi-1_5_dev" + model_name_or_path="microsoft/phi-1_5" + extra_cmd=$extra_cmd" --sq --alpha ${alpha}" + extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}" + script="run_generation_sq.py" + elif [ "${topology}" = "phi_2b" ]; then + alpha=0.5 + model_name_or_path="microsoft/phi-2" + extra_cmd=$extra_cmd" --sq --alpha ${alpha}" + extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}" + script="run_generation_sq.py" + elif [ "${topology}" = "phi_3b" ]; then + alpha=0.5 + model_name_or_path="microsoft/Phi-3-mini-4k-instruct" extra_cmd=$extra_cmd" --sq --alpha ${alpha}" extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}" extra_cmd=$extra_cmd" --trust_remote_code" diff --git a/examples/huggingface/pytorch/text2text-generation/requirements.txt b/examples/huggingface/pytorch/text2text-generation/requirements.txt index 8a585f9fd9e..73e4ae2e655 100644 --- 
a/examples/huggingface/pytorch/text2text-generation/requirements.txt +++ b/examples/huggingface/pytorch/text2text-generation/requirements.txt @@ -11,4 +11,4 @@ neural-compressor optimum-intel > 1.12.0 onnxruntime intel-extension-for-pytorch -lm-eval +lm-eval==0.4.2 diff --git a/examples/modelscope/requirements.txt b/examples/modelscope/requirements.txt index bc7a3e65de6..b04bd189db0 100644 --- a/examples/modelscope/requirements.txt +++ b/examples/modelscope/requirements.txt @@ -1,6 +1,6 @@ intel_extension_for_transformers neural-speed -lm-eval +lm-eval==0.4.2 sentencepiece gguf --extra-index-url https://download.pytorch.org/whl/cpu diff --git a/intel_extension_for_transformers/neural_chat/requirements_cpu.txt b/intel_extension_for_transformers/neural_chat/requirements_cpu.txt index 6097d2e2a0d..7b38113697b 100644 --- a/intel_extension_for_transformers/neural_chat/requirements_cpu.txt +++ b/intel_extension_for_transformers/neural_chat/requirements_cpu.txt @@ -7,7 +7,7 @@ fastapi fschat==0.2.32 huggingface_hub intel_extension_for_pytorch==2.3.0 -lm-eval +lm-eval==0.4.2 neural-compressor neural_speed==1.0a0 numpy==1.23.5 diff --git a/intel_extension_for_transformers/neural_chat/requirements_hpu.txt b/intel_extension_for_transformers/neural_chat/requirements_hpu.txt index 1c6dfa0d47a..f3983b6d3c5 100644 --- a/intel_extension_for_transformers/neural_chat/requirements_hpu.txt +++ b/intel_extension_for_transformers/neural_chat/requirements_hpu.txt @@ -4,7 +4,7 @@ evaluate fastapi fschat==0.2.35 huggingface_hub -lm-eval +lm-eval==0.4.2 neural-compressor numpy==1.23.5 optimum diff --git a/intel_extension_for_transformers/neural_chat/requirements_win.txt b/intel_extension_for_transformers/neural_chat/requirements_win.txt index c417c5ca01a..56ac6027ab4 100644 --- a/intel_extension_for_transformers/neural_chat/requirements_win.txt +++ b/intel_extension_for_transformers/neural_chat/requirements_win.txt @@ -6,7 +6,7 @@ fastapi fschat==0.2.35 huggingface_hub 
intel-extension-for-transformers -lm-eval +lm-eval==0.4.2 neural-compressor numpy==1.23.5 optimum diff --git a/intel_extension_for_transformers/neural_chat/tests/requirements.txt b/intel_extension_for_transformers/neural_chat/tests/requirements.txt index a4243865087..97a46d2e502 100644 --- a/intel_extension_for_transformers/neural_chat/tests/requirements.txt +++ b/intel_extension_for_transformers/neural_chat/tests/requirements.txt @@ -38,7 +38,7 @@ langchain-community==0.0.27 langchain_core==0.1.35 langid librosa -lm-eval +lm-eval==0.4.2 markdown neural-compressor neural_speed==1.0a0 diff --git a/intel_extension_for_transformers/qbits/__init__.py b/intel_extension_for_transformers/qbits/__init__.py index c23599090dc..5cd39d26a7f 100644 --- a/intel_extension_for_transformers/qbits/__init__.py +++ b/intel_extension_for_transformers/qbits/__init__.py @@ -16,5 +16,6 @@ # limitations under the License. import torch -if not torch.xpu._is_compiled(): - from intel_extension_for_transformers.qbits_py import * # pylint: disable=E0401, E0611 +import intel_extension_for_transformers +if "gpu" not in intel_extension_for_transformers.__version__: + from intel_extension_for_transformers.qbits_py import * # pylint: disable=E0401, E0611 diff --git a/intel_extension_for_transformers/transformers/config.py b/intel_extension_for_transformers/transformers/config.py index a0009e7d3ed..f5918267491 100644 --- a/intel_extension_for_transformers/transformers/config.py +++ b/intel_extension_for_transformers/transformers/config.py @@ -19,7 +19,7 @@ import yaml from enum import Enum -from neural_compressor.conf.dotdict import DotDict +from neural_compressor.utils.utility import DotDict from .utils.metrics import Metric from .utils.objectives import Objective, performance diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py index 7092d85452f..f90203d5ee4 100644 --- 
a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py @@ -935,9 +935,263 @@ def forward(self, input: torch.Tensor) -> tuple[torch.Tensor, None]: or device_map == torch.device("cpu") ) and model.config.model_type == "chatglm": model = model.float() + if ( + not torch.cuda.is_available() + or device_map == "cpu" + or device_map == torch.device("cpu") + ) and model.config.model_type == "mpt": + model.config.architectures = ["MptForCausalLM"] model.eval() logger.info("Applying SmoothQuant.") +<<<<<<< HEAD model = convert_to_smoothquant_model(model, quantization_config) +======= + # ipex.optimize_transformers + if quantization_config.ipex_opt_llm is None: + if model_type in IPEX_OPT_LLM_SUPPORTED: + quantization_config.ipex_opt_llm = True + logger.info( + "quantization_config.ipex_opt_llm set to True and ipex.optimize_transformers is used." + ) + logger.warning("The suggested transformers version is 4.38.1.") + else: + quantization_config.ipex_opt_llm = False + if quantization_config.ipex_opt_llm: + qconfig = ipex.quantization.get_smooth_quant_qconfig_mapping(alpha=0.5) + model = ipex.optimize_transformers( + model.eval(), + quantization_config=qconfig, + dtype=torch.float32, + inplace=True, + deployment_mode=False, + ) + model.eval() + + # past_key_values + num_beams = quantization_config.num_beams + if quantization_config.ipex_opt_llm: + past_key_values = generate_dummy_past_key_values_for_opt_llm( + config=model.config, input_bs=1, num_beams=num_beams + ) + else: + past_key_values = generate_dummy_past_key_values( + config=model.config, input_bs=1 + ) + + # calibration function + calib_func = quantization_config.calib_func + tokenizer = quantization_config.tokenizer + if calib_func is None: + if quantization_config.tokenizer is None: + logger.error( + "Please provide the tokenizer or provide calib_func directly," + + " the following is how to get tokenizer. 
\n" + + " from transformer import AutoTokenizer \n" + + " tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) \n" + ) + exit(0) + + from datasets import load_dataset + from torch.utils.data import DataLoader + + calib_dataset = quantization_config.calib_dataset + calib_shuffle = quantization_config.calib_shuffle + calib_iters = quantization_config.calib_iters + calib_padding = quantization_config.calib_padding + calib_len = quantization_config.calib_len + calib_pad_val = quantization_config.calib_pad_val + from torch.nn.functional import pad + + calib_dataset = load_dataset( + calib_dataset, + split=( + "test" + if calib_dataset in ["mbpp", "openai_humaneval"] + else "train" + ), + ) + if calib_shuffle: + calib_dataset = calib_dataset.shuffle(seed=42) + + def tokenize_function(examples): + if "code" in examples: + example = tokenizer(examples["code"]) + elif "prompt" in examples: + example = tokenizer(examples["prompt"]) + elif "text" in examples: + example = tokenizer(examples["text"]) + else: + logger.error( + "Please check dataset prompt identifier," + + " NeelNanda/pile-10k is default used calibration dataset." 
+ ) + exit(0) + return example + + def collate_batch(batch): + position_ids_padded = [] + input_ids_padded = [] + last_ind = [] + attention_mask_padded = [] + for text in batch: + input_ids = text["input_ids"] + if not calib_padding: + input_ids = ( + input_ids[: int(calib_len)] + if len(input_ids) > int(calib_len) + else input_ids + ) # no_padding + else: + pad_len = calib_len - input_ids.shape[0] + input_ids = pad( + input_ids, (0, pad_len), value=calib_pad_val + ) + + last_ind.append(input_ids.shape[0] - 1) + attention_mask = torch.ones(len(input_ids)) + position_ids = torch.arange(len(input_ids)) + input_ids_padded.append(input_ids) + attention_mask_padded.append(attention_mask) + position_ids_padded.append(position_ids) + if model_type in MODEL_TYPES_REQUIRING_POSITION_IDS: + return ( + { + "input_ids": torch.vstack(input_ids_padded), + "attention_mask": torch.vstack(attention_mask_padded), + "position_ids": torch.vstack(position_ids_padded), + "past_key_values": past_key_values, + }, + torch.tensor(last_ind), + ) + else: + return ( + { + "input_ids": torch.vstack(input_ids_padded), + "attention_mask": torch.vstack(attention_mask_padded), + "past_key_values": past_key_values, + }, + torch.tensor(last_ind), + ) + + def collate_batch_for_chatglm(batch): + last_ind = [] + for text in batch: + input_ids = torch.vstack([text["input_ids"]]) + if re.search( + "THUDM/chatglm-6b", model.config.auto_map["AutoConfig"] + ): + input_ids = ( + input_ids[:, :calib_len] + if input_ids.shape[1] > calib_len + else input_ids + ) + eos = torch.tensor([130001, 130004]).repeat(1, 1) + input_ids = torch.cat((input_ids, eos), 1) + else: + input_ids = ( + input_ids[:, :calib_len] + if input_ids.shape[1] > calib_len + else input_ids + ) + prepared_inputs = model.prepare_inputs_for_generation(input_ids) + attention_mask = torch.ones_like(input_ids) + last_ind.append(input_ids.shape[1] - 1) + return ( + { + "input_ids": input_ids, + "attention_mask": attention_mask, + "position_ids": 
prepared_inputs["position_ids"], + "past_key_values": past_key_values, + }, + torch.tensor(last_ind), + ) + + tokenized_dataset = calib_dataset.map(tokenize_function, batched=True) + tokenized_dataset.set_format(type="torch", columns=["input_ids"]) + if model_type == "chatglm": + calib_dataloader = DataLoader( + tokenized_dataset, + batch_size=1, + shuffle=False, + collate_fn=collate_batch_for_chatglm, + ) + else: + calib_dataloader = DataLoader( + tokenized_dataset, + batch_size=1, + shuffle=False, + collate_fn=collate_batch, + ) + + def calib_func(model): + with torch.no_grad(): + for i, (inputs, last_ind) in enumerate(calib_dataloader): + if i >= calib_iters: + break + if model_type in MODEL_TYPES_REQUIRING_POSITION_IDS: + model( + input_ids=inputs["input_ids"], + past_key_values=inputs["past_key_values"], + position_ids=inputs["position_ids"], + attention_mask=inputs["attention_mask"], + ) + else: + model( + input_ids=inputs["input_ids"], + past_key_values=inputs["past_key_values"], + attention_mask=inputs["attention_mask"], + ) + + logger.info( + "The default calibration function is used, " + + "the calibration dataset is NeelNanda/pile-10k, " + + "batchsize is 1 and calibration iteration is 100." 
+ ) + calib_func = calib_func + + # example_inputs + example_inputs = quantization_config.example_inputs + if example_inputs is None: + for i, (inputs, last_ind) in enumerate(calib_dataloader): + if model_type in MODEL_TYPES_REQUIRING_POSITION_IDS: + example_inputs = { + "input_ids": inputs["input_ids"], + "attention_mask": inputs["attention_mask"], + "position_ids": inputs["position_ids"], + "past_key_values": inputs["past_key_values"], + } + else: + example_inputs = { + "input_ids": inputs["input_ids"], + "attention_mask": inputs["attention_mask"], + "past_key_values": inputs["past_key_values"], + } + break + + # call inc sq + from neural_compressor import PostTrainingQuantConfig, quantization + + conf = PostTrainingQuantConfig( + backend=quantization_config.backend, # default is ipex + excluded_precisions=quantization_config.excluded_precisions, + op_type_dict=quantization_config.op_type_dict, + op_name_dict=quantization_config.op_name_dict, + recipes=quantization_config.recipes, + example_inputs=example_inputs, + ) + + model = quantization.fit( + model, + conf, + calib_func=calib_func, + calib_dataloader=( + calib_dataloader + if quantization_config.recipes["smooth_quant_args"]["alpha"] + == "auto" + else None + ), + ) logger.info("SmoothQuant done.") elif isinstance(quantization_config, DynamicQuantConfig): model = cls.ORIG_MODEL.from_pretrained( diff --git a/intel_extension_for_transformers/transformers/utils/utility.py b/intel_extension_for_transformers/transformers/utils/utility.py index 527d8b097ff..5f353b296ed 100644 --- a/intel_extension_for_transformers/transformers/utils/utility.py +++ b/intel_extension_for_transformers/transformers/utils/utility.py @@ -95,3 +95,410 @@ def __init__(self) -> None: self.dataset = dataloader.dataset return INCDataLoader() + + +def generate_dummy_past_key_values(config, input_bs): + """Generate the dummy past_key_values.""" + from optimum.utils import NormalizedConfigManager + if 
config.model_type == "qwen": + new_shape = [ + input_bs, + 0, + config.num_attention_heads, + config.hidden_size // config.num_attention_heads, + ] + num_layers = config.num_hidden_layers + elif config.model_type == "baichuan": + new_shape = [ + input_bs, + config.num_attention_heads, + 0, + config.hidden_size // config.num_attention_heads, + ] + num_layers = config.num_hidden_layers + elif config.model_type == "chatglm": + new_shape = [ + 0, + input_bs, + config.num_attention_heads, + config.hidden_size // config.num_attention_heads, + ] + num_layers = config.num_layers + else: + normalized_config = NormalizedConfigManager.get_normalized_config_class( + config.model_type + )(config) + nb_pkv = 2 + num_layers = normalized_config.num_layers + num_attention_heads = normalized_config.num_attention_heads + hidden_size = normalized_config.hidden_size + d_k = hidden_size // num_attention_heads + num_key_value_heads = num_attention_heads + if hasattr(normalized_config, "num_key_value_heads"): + num_key_value_heads = normalized_config.num_key_value_heads + if hasattr(normalized_config, "multi_query_group_num"): + num_key_value_heads = normalized_config.multi_query_group_num + + if config.model_type == "bloom": + shape_key = (input_bs * num_attention_heads, d_k, 1) + shape_value = (input_bs * num_attention_heads, 1, d_k) + key = torch.ones(size=shape_key) + value = torch.ones(size=shape_value) + past_key_values = tuple( + tuple(key if idx % 2 == 0 else value for idx in range(nb_pkv)) + for _ in range(num_layers) + ) + return past_key_values + elif config.model_type == "gpt_bigcode": + new_shape = [input_bs, 0, d_k * 2] + dummy_tensor = torch.zeros(size=new_shape) + past_key_values = tuple([dummy_tensor] * num_layers) + return past_key_values + elif config.model_type == "falcon": + new_shape = [input_bs, 1, 0, d_k] + else: + new_shape = [input_bs, num_key_value_heads, 0, d_k] + past_key_values = [ + ( + torch.zeros(size=new_shape).contiguous(), + 
torch.zeros(size=new_shape).contiguous(), + ) + for _ in range(num_layers) + ] + return tuple(past_key_values) + +def generate_dummy_past_key_values_for_inference(config, input_bs): + """Generate the dummy past_key_values.""" + from optimum.utils import NormalizedConfigManager + if config.model_type == "qwen": + new_shape = [ + input_bs, + 0, + config.num_attention_heads, + config.hidden_size // config.num_attention_heads, + ] + num_layers = config.num_hidden_layers + elif config.model_type == "baichuan": + new_shape = [ + input_bs, + config.num_attention_heads, + 0, + config.hidden_size // config.num_attention_heads, + ] + num_layers = config.num_hidden_layers + elif config.model_type == "chatglm": + new_shape = [ + 0, + input_bs, + config.num_attention_heads, + config.hidden_size // config.num_attention_heads, + ] + num_layers = config.num_layers + else: + normalized_config = NormalizedConfigManager.get_normalized_config_class( + config.model_type + )(config) + nb_pkv = 2 + num_layers = normalized_config.num_layers + num_attention_heads = normalized_config.num_attention_heads + hidden_size = normalized_config.hidden_size + d_k = hidden_size // num_attention_heads + num_key_value_heads = num_attention_heads + if hasattr(normalized_config, "num_key_value_heads"): + num_key_value_heads = normalized_config.num_key_value_heads + if hasattr(normalized_config, "multi_query_group_num"): + num_key_value_heads = normalized_config.multi_query_group_num + + if config.model_type == "bloom": + shape_key = (input_bs * num_attention_heads, d_k, 0) + shape_value = (input_bs * num_attention_heads, 0, d_k) + key = torch.empty(size=shape_key) + value = torch.empty(size=shape_value) + past_key_values = tuple( + tuple(key if idx % 2 == 0 else value for idx in range(nb_pkv)) + for _ in range(num_layers) + ) + return past_key_values + elif config.model_type == "gpt_bigcode": + new_shape = [input_bs, 0, d_k * 2] + dummy_tensor = torch.zeros(size=new_shape) + past_key_values = 
tuple([dummy_tensor] * num_layers) + return past_key_values + elif config.model_type == "falcon": + new_shape = [input_bs, 1, 0, d_k] + else: + new_shape = [input_bs, num_key_value_heads, 0, d_k] + past_key_values = [ + ( + torch.zeros(size=new_shape).contiguous(), + torch.zeros(size=new_shape).contiguous(), + ) + for _ in range(num_layers) + ] + return tuple(past_key_values) + +def generate_dummy_past_key_values_for_opt_llm(config, input_bs, num_beams=1): + """Generate the dummy past_key_values.""" + from optimum.utils import NormalizedConfigManager + if config.model_type == "qwen": + new_shape = [ + input_bs, + 1, + config.num_attention_heads, + config.hidden_size // config.num_attention_heads, + ] + num_layers = config.num_hidden_layers + elif config.model_type == "baichuan": + new_shape = [ + input_bs, + config.num_attention_heads, + 1, + config.hidden_size // config.num_attention_heads, + ] + num_layers = config.num_hidden_layers + elif config.model_type == "chatglm": + new_shape = [ + 1, + input_bs, + config.num_attention_heads, + config.hidden_size // config.num_attention_heads, + ] + num_layers = config.num_layers + else: + normalized_config = NormalizedConfigManager.get_normalized_config_class( + config.model_type + )(config) + num_layers = normalized_config.num_layers + num_attention_heads = normalized_config.num_attention_heads + hidden_size = normalized_config.hidden_size + d_k = hidden_size // num_attention_heads + num_key_value_heads = num_attention_heads + nb_pkv = 2 + if hasattr(normalized_config, "num_key_value_heads"): + num_key_value_heads = normalized_config.num_key_value_heads + if hasattr(normalized_config, "multi_query_group_num"): + num_key_value_heads = normalized_config.multi_query_group_num + if config.model_type == "bloom": + for nb_pkv in range(nb_pkv): + if nb_pkv % 2 == 0: + new_shape = [input_bs * num_key_value_heads, d_k, 1] + else: + new_shape = [input_bs * num_key_value_heads, 1, d_k] + + else: + new_shape = [input_bs, 
num_key_value_heads, 1, d_k] + + beam_idx_tmp = torch.zeros( + (2048, int(input_bs * num_beams)), dtype=torch.long + ).contiguous() + past_key_values = [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + torch.zeros(size=new_shape).contiguous(), + torch.zeros(size=new_shape).contiguous(), + beam_idx_tmp, + ) + for _ in range(num_layers) + ] + return tuple(past_key_values) + +IPEX_OPT_LLM_SUPPORTED_DICT = { + "2.2": ["gptj", "opt", "llama", "falcon", "chatglm", "baichuan", "gpt-neox"], + "2.3": [ + "gptj", + "opt", + "llama", + "falcon", + "chatglm", + "baichuan", + "qwen", + "bloom", + "codegen", + "gptbigcode", + "t5", + "mixtral", + "mpt", + ], +} + +MODEL_TYPES_REQUIRING_POSITION_IDS = { + "codegen", + "gpt2", + "gpt-bigcode", + "gpt-neo", + "gpt-neox", + "gptj", + "imagegpt", + "llama", + "mistral", + "chatglm", +} + +if is_ipex_available() and ipex.__version__ == "2.2.0+cpu": + logger.info( + "ipex.llm.optimize by 2.2.0 version supported model family: {}".format( + ",".join(IPEX_OPT_LLM_SUPPORTED_DICT["2.2"]) + ) + ) + logger.info( + "The recommended transformers version is 4.35.2 if you used IPEX 2.2.0 version." + ) + IPEX_OPT_LLM_SUPPORTED = IPEX_OPT_LLM_SUPPORTED_DICT["2.2"] +elif is_ipex_available() and ipex.__version__ == "2.3.0+cpu": + logger.info( + "ipex.llm.optimize by 2.3.0 version supported model family: {}".format( + ", ".join(IPEX_OPT_LLM_SUPPORTED_DICT["2.3"]) + ) + ) + logger.info( + "The recommended transformers version is 4.38.1 if you used IPEX 2.3.0 version." + ) + IPEX_OPT_LLM_SUPPORTED = IPEX_OPT_LLM_SUPPORTED_DICT["2.3"] +else: + logger.warning("Please check the intel_extension_for_pytorch version is 2.3.0+cpu.") + IPEX_OPT_LLM_SUPPORTED = IPEX_OPT_LLM_SUPPORTED_DICT["2.3"] + +def get_example_inputs(model_config, batch_size=1, tokenizer=None, num_beams=4): + """Generate the dummy example inputs.""" + prompt = "Welcome to use Intel Extension for Transformers." 
+ prompt = [prompt] * batch_size + input_ids = tokenizer(prompt, return_tensors="pt").input_ids + model_type = model_config.model_type.replace("_", "-") + if model_type in IPEX_OPT_LLM_SUPPORTED: + past_key_values = generate_dummy_past_key_values_for_opt_llm( + config=model_config, + input_bs=batch_size, + num_beams=num_beams + ) + else: + past_key_values = generate_dummy_past_key_values(config=model_config, input_bs=batch_size) + + input_ids = input_ids[:, :512] + attention_mask = torch.ones(input_ids.shape) + position_ids = torch.arange(input_ids.shape[1]).repeat(batch_size, 1) + + if model_type in MODEL_TYPES_REQUIRING_POSITION_IDS: + example_inputs = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "position_ids": position_ids, + "past_key_values": past_key_values + } + else: + example_inputs = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "past_key_values": past_key_values + } + return example_inputs + + +def make_torchscript_model(model, json_file_path, example_inputs): + """Recover ipex model from JSON file. + + Args: + model (object): fp32 model need to do quantization. + json_file_path (json): configuration JSON file for ipex. + example_inputs (tuple or torch.Tensor or dict): example inputs that will be passed to the ipex function. 
+ + Returns: + (object): quantized model + """ + + ipex = LazyImport("intel_extension_for_pytorch") + from torch.ao.quantization.observer import MinMaxObserver + + if ipex.__version__ >= "2.1.100": + qconfig = ipex.quantization.get_smooth_quant_qconfig_mapping(alpha=0.5, act_observer=MinMaxObserver) + else: + qconfig = ipex.quantization.get_smooth_quant_qconfig_mapping(alpha=0.5, act_observer=MinMaxObserver()) + if isinstance(example_inputs, dict): + model = ipex.quantization.prepare(model, qconfig, example_kwarg_inputs=example_inputs, inplace=True) + else: + model = ipex.quantization.prepare(model, qconfig, example_inputs=example_inputs, inplace=True) + model.load_qconf_summary(qconf_summary=json_file_path) + model = ipex.quantization.convert(model, inplace=True) + model.eval() + with torch.no_grad(): + try: + if isinstance(example_inputs, dict): + # pylint: disable=E1120,E1123 + model = torch.jit.trace(model, example_kwarg_inputs=example_inputs) + else: + model = torch.jit.trace(model, example_inputs) + model = torch.jit.freeze(model.eval()) + except: + if isinstance(example_inputs, dict): + # pylint: disable=E1120,E1123 + model = torch.jit.trace(model, example_kwarg_inputs=example_inputs, strict=False, check_trace=False) + else: + model = torch.jit.trace(model, example_inputs, strict=False) + model = torch.jit.freeze(model.eval()) + if isinstance(example_inputs, dict): + model(**example_inputs) + model(**example_inputs) + elif isinstance(example_inputs, tuple) or isinstance(example_inputs, list): + model(*example_inputs) + model(*example_inputs) + else: + model(example_inputs) + model(example_inputs) + return model + +def recover_model_from_json(fp32_model_name_or_path, json_file_path, trust_remote_code=False): + """Recover ipex model from JSON file. + + Args: + model (object): fp32 model need to do quantization. + json_file_path (json): configuration JSON file for ipex. + trust_remote_code (bool): trust remote code. 
+ + Returns: + (object): quantized model + """ + from transformers import AutoModelForCausalLM + + # ipex recovered int8 model from configure.json requests float32 model input and on cpu device. + user_model = AutoModelForCausalLM.from_pretrained(fp32_model_name_or_path, + trust_remote_code=trust_remote_code).float() + if user_model.config.model_type in IPEX_OPT_LLM_SUPPORTED: + import intel_extension_for_pytorch as ipex + qconfig = ipex.quantization.default_static_qconfig_mapping + user_model = ipex.optimize_transformers( + user_model.eval(), + dtype=torch.float, + inplace=True, + quantization_config=qconfig, + deployment_mode=False, + ) + + # tokenizer + if user_model.config.model_type == "llama": + from transformers import LlamaTokenizer + tokenizer = LlamaTokenizer.from_pretrained(user_model.config.name_or_path) + else: + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained( + user_model.config.name_or_path, trust_remote_code=trust_remote_code + ) + + # example_inputs + example_inputs = get_example_inputs(user_model.config, tokenizer=tokenizer) + + # pylint: disable=E0611 + user_model.config.torchscript = True + config = user_model.config + user_model = make_torchscript_model(user_model, json_file_path, example_inputs) + import intel_extension_for_pytorch as ipex + from intel_extension_for_transformers.transformers.llm.evaluation.models import ( + TSModelCausalLMForITREX, + ) + origin_model_type = config.model_type + if origin_model_type in ["chatglm", "qwen", "baichuan"]: + config.model_type = "qwen2" + user_model = TSModelCausalLMForITREX(user_model, config=config) + user_model.config.model_type = origin_model_type + return user_model diff --git a/intel_extension_for_transformers/transformers/utils/utility_tf.py b/intel_extension_for_transformers/transformers/utils/utility_tf.py deleted file mode 100644 index f19785740af..00000000000 --- a/intel_extension_for_transformers/transformers/utils/utility_tf.py +++ 
/dev/null @@ -1,107 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (c) 2022 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Utils for tensorflow framework.""" - -import os -import json -from collections import OrderedDict, UserDict -from neural_compressor.experimental import common - -TMPPATH = os.path.join('tmp', 'model') -TEACHERPATH = os.path.join('tmp', 'teacher_model') -class TFDataloader(object): - """Tensorflow dataloader. - - Args: - dataset (string): Dataset - """ - - def __init__(self, dataset, batch_size=None): - """Init an instance.""" - self.dataset = dataset - self.batch_size = batch_size - - def __iter__(self): - """Get the iteration of dataset.""" - for inputs, labels in self.dataset: - if isinstance(inputs, dict) or isinstance(inputs, OrderedDict) \ - or isinstance(inputs, UserDict): - for name in inputs.keys(): - inputs[name] = inputs[name].numpy() - elif isinstance(inputs, list) or isinstance(inputs, tuple): - inputs = [input.numpy() for input in inputs] - else: - inputs = inputs.numpy() - - if isinstance(labels, dict) or isinstance(labels, OrderedDict) \ - or isinstance(labels, UserDict): # pragma: no cover - for name in labels.keys(): - labels[name] = labels[name].numpy() - elif isinstance(labels, list) or isinstance(labels, tuple): - labels = [label.numpy() for label in labels] - else: - labels = labels.numpy() - yield inputs, labels - - def __len__(self): - """Return the length of dataset.""" - 
return len(self.dataset) - - -def distributed_init(worker_addresses, type='worker', index=0): - """Init distribute environment. - - Args: - worker_addresses: Addresses of all nodes. - type: The type of node, such as worker. - index: When index is 0, the node treat as a chief. - """ - tf_config = { - 'cluster': { - 'worker': worker_addresses - }, - 'task': {'type': type, 'index': index} - } - os.environ['TF_CONFIG'] = json.dumps(tf_config) - -def _is_chief(task_type, task_id): - # here only consider the case in which TF_CONFIG task_type is set as worker - # and task_id=0 represents the chief - return (task_type == 'worker' and task_id == 0) - -# get model folder path for the distributed environment -def get_filepath(base_dirpath, task_type, task_id): - """Get model folder path for the distributed environment. - - Args: - base_dirpath: The basic folder path. - task_type: Task_type is set as worker. - task_id: Task id. When task_id=0, the node treat as a chief. - """ - if task_type is None: # single node - return base_dirpath - elif _is_chief(task_type, task_id): - return os.path.join(base_dirpath, 'chief') - else: - return os.path.join(base_dirpath, 'worker_' + str(task_id)) - - -# convert a Keras model to SavedModel -def keras2SavedModel(model): # pragma: no cover - """Transfer keras model into save_model.""" - model = common.Model(model) - return model.model diff --git a/setup.py b/setup.py index 17700afeeb3..13aec7b7025 100644 --- a/setup.py +++ b/setup.py @@ -8,10 +8,12 @@ from pathlib import Path from setuptools import Extension, find_packages, setup from setuptools.command.build_ext import build_ext +from setuptools_scm import get_version result = subprocess.Popen("pip install -r requirements.txt", shell=True) result.wait() + def is_intel_gpu_available(): import torch import intel_extension_for_pytorch as ipex @@ -286,6 +288,9 @@ def check_submodules(): "intel_extension_for_transformers/transformers/runtime/"), ]) cmdclass = {'build_ext': CMakeBuild} + 
itrex_version = get_version() + if IS_INTEL_GPU: + itrex_version = itrex_version + "-gpu" setup( name="intel-extension-for-transformers", @@ -324,4 +329,5 @@ def check_submodules(): ], setup_requires=['setuptools_scm'], use_scm_version=True, + version=itrex_version )