From f44bf957d5097d97be7ff9ee13f81923b3a0f51c Mon Sep 17 00:00:00 2001 From: "Wang, Chang" Date: Thu, 27 Jun 2024 10:59:31 +0800 Subject: [PATCH 1/9] Fix docs online build (#1637) Signed-off-by: Wang, Chang Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- docs/api_doc/optimization/config.rst | 5 +- docs/api_doc/optimization/tf_optimization.rst | 6 - docs/api_doc/user_api.rst | 2 - examples/.config/tensorflow_optimize.json | 255 ------------------ 4 files changed, 1 insertion(+), 267 deletions(-) delete mode 100644 docs/api_doc/optimization/tf_optimization.rst delete mode 100644 examples/.config/tensorflow_optimize.json diff --git a/docs/api_doc/optimization/config.rst b/docs/api_doc/optimization/config.rst index 435f5d6ddad..2ca607c03dc 100644 --- a/docs/api_doc/optimization/config.rst +++ b/docs/api_doc/optimization/config.rst @@ -4,7 +4,4 @@ Config .. autoapisummary:: intel_extension_for_transformers.transformers.utils.metrics intel_extension_for_transformers.transformers.utils.objectives - intel_extension_for_transformers.transformers.config - intel_extension_for_transformers.transformers.quantization - intel_extension_for_transformers.transformers.distillation - intel_extension_for_transformers.transformers.pruning + intel_extension_for_transformers.transformers.utils.config diff --git a/docs/api_doc/optimization/tf_optimization.rst b/docs/api_doc/optimization/tf_optimization.rst deleted file mode 100644 index 3aa7cb7864a..00000000000 --- a/docs/api_doc/optimization/tf_optimization.rst +++ /dev/null @@ -1,6 +0,0 @@ -TensorFlow Optimizer -============== - -.. autoapisummary:: - - intel_extension_for_transformers.transformers.optimizer_tf diff --git a/docs/api_doc/user_api.rst b/docs/api_doc/user_api.rst index 712132f5d55..80a7ead6078 100644 --- a/docs/api_doc/user_api.rst +++ b/docs/api_doc/user_api.rst @@ -7,7 +7,5 @@ The following Python API information is available: :maxdepth: 1 optimization/trainer.rst - optimization/optimizer.rst optimization/model.rst - optimization/tf_optimization.rst optimization/config.rst diff --git a/examples/.config/tensorflow_optimize.json b/examples/.config/tensorflow_optimize.json deleted file mode 100644 index ab0aacaf6ce..00000000000 --- a/examples/.config/tensorflow_optimize.json +++ /dev/null @@ -1,255 +0,0 @@ -{ - "bert_base_mrpc_static": { - "working_dir": "huggingface/tensorflow/text-classification/quantization/ptq", - "tune":{ - "cmd": "bash run_tuning.sh", - "params": { - "topology": "bert_base_mrpc_static", - "output_model": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/text-classification" - } - }, - "benchmark": { - "cmd": "bash run_benchmark.sh", - "params": { - "topology": "bert_base_mrpc_static", - "mode": "accuracy", - "batch_size": "64", - "iters": "100", - "int8": "false", - "config": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/text-classification" - } - } - }, - "distilgpt2_clm": { - "working_dir": "huggingface/tensorflow/language-modeling/quantization/ptq", - "tune":{ - "cmd": "bash run_tuning.sh", - "params": { - "topology": "distilgpt2_clm", - "output_model": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/language-modeling" - } - }, - "benchmark": { - "cmd": "bash run_benchmark.sh", - "params": { - "topology": "distilgpt2_clm", - "mode": "accuracy", - "batch_size": "16", - "iters": "100", - "int8": "false", - "config": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/language-modeling" - } - } - }, - "distilbert_mlm": { - "working_dir": 
"huggingface/tensorflow/language-modeling/quantization/ptq", - "tune":{ - "cmd": "bash run_tuning.sh", - "params": { - "topology": "distilbert_mlm", - "output_model": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/language-modeling" - } - }, - "benchmark": { - "cmd": "bash run_benchmark.sh", - "params": { - "topology": "distilbert_mlm", - "mode": "accuracy", - "batch_size": "16", - "iters": "100", - "int8": "false", - "config": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/language-modeling" - } - } - }, - "bert_base_ner": { - "working_dir": "huggingface/tensorflow/token-classification/quantization", - "tune":{ - "cmd": "bash run_tuning.sh", - "params": { - "topology": "bert_base_ner", - "output_model": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/token-classification" - } - }, - "benchmark": { - "cmd": "bash run_benchmark.sh", - "params": { - "topology": "bert_base_ner", - "mode": "accuracy", - "batch_size": "16", - "iters": "100", - "int8": "false", - "config": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/token-classification" - } - } - }, - "distilbert_qa": { - "working_dir": "huggingface/tensorflow/question-answering/quantization", - "tune":{ - "cmd": "bash run_tuning.sh", - "params": { - "topology": "distilbert_qa", - "output_model": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/question-answering" - } - }, - "benchmark": { - "cmd": "bash run_benchmark.sh", - "params": { - "topology": "distilbert_qa", - "mode": "accuracy", - "batch_size": "16", - "iters": "100", - "int8": "false", - "config": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/question-answering" - } - } - }, - "distilbert_swag": { - "working_dir": "huggingface/tensorflow/multiple-choice/quantization", - "tune":{ - "cmd": "bash run_tuning.sh", - "params": { - "topology": "distilbert_swag", - "output_model": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/multiple-choice" - } - }, - "benchmark": { - "cmd": "bash run_benchmark.sh", - "params": { - "topology": "distilbert_swag", - "mode": "accuracy", - "batch_size": "16", - "iters": "100", - "int8": "false", - "config": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/multiple-choice" - } - } - }, - "roberta_qa": { - "working_dir": "huggingface/tensorflow/question-answering/quantization", - "tune":{ - "cmd": "bash run_tuning.sh", - "params": { - "topology": "roberta_qa", - "output_model": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/question-answering" - } - }, - "benchmark": { - "cmd": "bash run_benchmark.sh", - "params": { - "topology": "roberta_qa", - "mode": "accuracy", - "batch_size": "16", - "iters": "100", - "int8": "false", - "config": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/question-answering" - } - } - }, - "distilroberta_mlm": { - "working_dir": "huggingface/tensorflow/language-modeling/quantization/ptq", - "tune":{ - "cmd": "bash run_tuning.sh", - "params": { - "topology": "distilroberta_mlm", - "output_model": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/language-modeling" - } - }, - "benchmark": { - "cmd": "bash run_benchmark.sh", - "params": { - "topology": "distilroberta_mlm", - "mode": "accuracy", - "batch_size": "16", - "iters": "100", - "int8": "false", - "config": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/language-modeling" - } - } - }, - "legalbert_mrpc": { - "working_dir": "huggingface/tensorflow/text-classification/quantization/ptq", - "tune":{ - "cmd": "bash run_tuning.sh", - 
"params": { - "topology": "legalbert_mrpc", - "output_model": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/text-classification" - } - }, - "benchmark": { - "cmd": "bash run_benchmark.sh", - "params": { - "topology": "legalbert_mrpc", - "mode": "accuracy", - "batch_size": "16", - "iters": "100", - "int8": "false", - "config": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/text-classification" - } - } - }, - "xlnet_mrpc": { - "working_dir": "huggingface/tensorflow/text-classification/quantization/ptq", - "tune":{ - "cmd": "bash run_tuning.sh", - "params": { - "topology": "xlnet_mrpc", - "output_model": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/text-classification" - } - }, - "benchmark": { - "cmd": "bash run_benchmark.sh", - "params": { - "topology": "xlnet_mrpc", - "mode": "accuracy", - "batch_size": "16", - "iters": "100", - "int8": "false", - "config": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/text-classification" - } - } - }, - "albert_large_mrpc": { - "working_dir": "huggingface/tensorflow/text-classification/quantization/ptq", - "tune":{ - "cmd": "bash run_tuning.sh", - "params": { - "topology": "albert_large_mrpc", - "output_model": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/text-classification" - } - }, - "benchmark": { - "cmd": "bash run_benchmark.sh", - "params": { - "topology": "albert_large_mrpc", - "mode": "accuracy", - "batch_size": "16", - "iters": "100", - "int8": "false", - "config": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/text-classification" - } - } - } -} From b1168c13dedf5b110a786649ffcca615894fdb70 Mon Sep 17 00:00:00 2001 From: "Wang, Chang" Date: Thu, 27 Jun 2024 11:30:51 +0800 Subject: [PATCH 2/9] Fix SQ bloom (#1636) Signed-off-by: changwangss --- .../transformers/llm/evaluation/models.py | 3 +-- .../transformers/modeling/modeling_auto.py | 6 +----- .../transformers/utils/utility.py | 6 +----- 3 files changed, 3 insertions(+), 12 deletions(-) diff --git a/intel_extension_for_transformers/transformers/llm/evaluation/models.py b/intel_extension_for_transformers/transformers/llm/evaluation/models.py index 98dc24e3673..61b301a380a 100644 --- a/intel_extension_for_transformers/transformers/llm/evaluation/models.py +++ b/intel_extension_for_transformers/transformers/llm/evaluation/models.py @@ -38,8 +38,7 @@ def _reorder_cache( This is required to match `past_key_values` with the correct beam_idx at every generation step. 
""" - if self.config.model_type == "bloom": - return self._reorder_cache_bloom(past_key_values, beam_idx) + if self.config.model_type == "chatglm": return tuple( tuple( diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py index c0a9925494a..263e4784d92 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py @@ -946,11 +946,7 @@ def collate_batch(batch): ) last_ind.append(input_ids.shape[0] - 1) - if model_type in ["bloom"]: - attention_mask = torch.ones(len(input_ids) + 1) - attention_mask[0] = 0 - else: - attention_mask = torch.ones(len(input_ids)) + attention_mask = torch.ones(len(input_ids)) position_ids = torch.arange(len(input_ids)) input_ids_padded.append(input_ids) attention_mask_padded.append(attention_mask) diff --git a/intel_extension_for_transformers/transformers/utils/utility.py b/intel_extension_for_transformers/transformers/utils/utility.py index 2467531fab2..78fe5f2063d 100644 --- a/intel_extension_for_transformers/transformers/utils/utility.py +++ b/intel_extension_for_transformers/transformers/utils/utility.py @@ -375,11 +375,7 @@ def get_example_inputs(model_config, batch_size=1, tokenizer=None, num_beams=4): past_key_values = generate_dummy_past_key_values(config=model_config, input_bs=batch_size) input_ids = input_ids[:, :512] - if model_type in ["bloom", "qwen"]: - attention_mask = torch.ones(input_ids.shape[0], input_ids.shape[1] + 1) - attention_mask[:,0] = 0 - else: - attention_mask = torch.ones(input_ids.shape) + attention_mask = torch.ones(input_ids.shape) position_ids = torch.arange(input_ids.shape[1]).repeat(batch_size, 1) if model_type in MODEL_TYPES_REQUIRING_POSITION_IDS: From c47dcb8466ee1aee5a7fa6d5e47461331cb56db5 Mon Sep 17 00:00:00 2001 From: "Wang, Chang" Date: Fri, 28 Jun 2024 09:29:26 +0800 Subject: [PATCH 3/9] Support phi series SQ (#1639) Signed-off-by: changwangss --- examples/.config/pytorch_optimize.json | 52 +++++++++++++++++++ .../quantization/run_benchmark.sh | 13 +++-- .../quantization/run_tuning.sh | 17 ++++-- 3 files changed, 75 insertions(+), 7 deletions(-) diff --git a/examples/.config/pytorch_optimize.json b/examples/.config/pytorch_optimize.json index 1c1573ceb41..791dc367033 100644 --- a/examples/.config/pytorch_optimize.json +++ b/examples/.config/pytorch_optimize.json @@ -2450,6 +2450,58 @@ } } }, + "phi_2b_gen_ipex_static": { + "working_dir": "huggingface/pytorch/text-generation/quantization", + "tune": { + "cmd": "bash run_tuning.sh", + "params": { + "topology": "phi_2b", + "task": "generation", + "approach": "static", + "output_model": "saved_results" + } + }, + "benchmark": { + "cmd": "bash run_benchmark.sh", + "params": { + "topology": "phi_2b", + "task": "generation", + "approach": "static", + "backend": "ipex", + "mode": "benchmark", + "batch_size": "112", + "iters": "100", + "int8": "false", + "config": "saved_results" + } + } + }, + "phi_3b_gen_ipex_static": { + "working_dir": "huggingface/pytorch/text-generation/quantization", + "tune": { + "cmd": "bash run_tuning.sh", + "params": { + "topology": "phi_3b", + "task": "generation", + "approach": "static", + "output_model": "saved_results" + } + }, + "benchmark": { + "cmd": "bash run_benchmark.sh", + "params": { + "topology": "phi_3b", + "task": "generation", + "approach": "static", + "backend": "ipex", + "mode": "benchmark", + "batch_size": "112", + "iters": "100", + 
"int8": "false", + "config": "saved_results" + } + } + }, "flan-t5-large_gen_ipex_static": { "working_dir": "huggingface/pytorch/text2text-generation", "tune": { diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh b/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh index c92e733e9fe..61cd923588b 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh +++ b/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh @@ -166,13 +166,18 @@ function run_benchmark { model_name_or_path="Intel/neural-chat-7b-v3" script="run_generation_sq.py" elif [ "${topology}" = "phi_1b" ]; then - model_name_or_path="susnato/phi-1_dev" - pip install transformers==4.36.1 + model_name_or_path="microsoft/phi-1" script="run_generation_sq.py" elif [ "${topology}" = "phi_1_5b" ]; then - model_name_or_path="susnato/phi-1_5_dev" - pip install transformers==4.36.1 + model_name_or_path="microsoft/phi-1_5" script="run_generation_sq.py" + elif [ "${topology}" = "phi_2b" ]; then + model_name_or_path="microsoft/phi-2" + script="run_generation_sq.py" + elif [ "${topology}" = "phi_3b" ]; then + model_name_or_path="microsoft/Phi-3-mini-4k-instruct" + script="run_generation_sq.py" + extra_cmd=$extra_cmd" --trust_remote_code" elif [ "${topology}" = "llama2_7b_gptq" ] && [ "$model_source" != "huggingface" ]; then model_name_or_path="/tf_dataset2/models/nlp_toolkit/llama-2-7b-chat/Llama-2-7b-chat-hf" script="run_generation_cpu_woq.py" diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh b/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh index 7c3919a132a..7dfa912f90e 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh +++ b/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh @@ -216,14 +216,25 @@ function run_tuning { script="run_generation_sq.py" elif [ "${topology}" = "phi_1b" ]; then alpha=0.5 - model_name_or_path="susnato/phi-1_dev" + model_name_or_path="microsoft/phi-1" extra_cmd=$extra_cmd" --sq --alpha ${alpha}" extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}" - extra_cmd=$extra_cmd" --trust_remote_code" script="run_generation_sq.py" elif [ "${topology}" = "phi_1_5b" ]; then alpha=0.5 - model_name_or_path="susnato/phi-1_5_dev" + model_name_or_path="microsoft/phi-1_5" + extra_cmd=$extra_cmd" --sq --alpha ${alpha}" + extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}" + script="run_generation_sq.py" + elif [ "${topology}" = "phi_2b" ]; then + alpha=0.5 + model_name_or_path="microsoft/phi-2" + extra_cmd=$extra_cmd" --sq --alpha ${alpha}" + extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}" + script="run_generation_sq.py" + elif [ "${topology}" = "phi_3b" ]; then + alpha=0.5 + model_name_or_path="microsoft/Phi-3-mini-4k-instruct" extra_cmd=$extra_cmd" --sq --alpha ${alpha}" extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}" extra_cmd=$extra_cmd" --trust_remote_code" From a9a0e9321ef31228e8f735374ba52ba7af3dc462 Mon Sep 17 00:00:00 2001 From: "Wang, Zhe" Date: Fri, 28 Jun 2024 17:07:12 +0800 Subject: [PATCH 4/9] fix xpu version itrex detect (#1638) --- .github/workflows/script/install_binary.sh | 1 + intel_extension_for_transformers/qbits/__init__.py | 5 +++-- setup.py | 6 ++++++ 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/.github/workflows/script/install_binary.sh b/.github/workflows/script/install_binary.sh index bbd6b7df2f1..7bca0d4d2f3 100644 --- 
a/.github/workflows/script/install_binary.sh +++ b/.github/workflows/script/install_binary.sh @@ -4,6 +4,7 @@ source /intel-extension-for-transformers/.github/workflows/script/change_color.s cd /intel-extension-for-transformers export CMAKE_ARGS="-DNE_DNNL_CACHE_DIR=/cache" pip install -U pip +pip install -r requirements.txt $BOLD_YELLOW && echo "---------------- git submodule update --init --recursive -------------" && $RESET git config --global --add safe.directory "*" git submodule update --init --recursive diff --git a/intel_extension_for_transformers/qbits/__init__.py b/intel_extension_for_transformers/qbits/__init__.py index c23599090dc..5cd39d26a7f 100644 --- a/intel_extension_for_transformers/qbits/__init__.py +++ b/intel_extension_for_transformers/qbits/__init__.py @@ -16,5 +16,6 @@ # limitations under the License. import torch -if not torch.xpu._is_compiled(): - from intel_extension_for_transformers.qbits_py import * # pylint: disable=E0401, E0611 +import intel_extension_for_transformers +if "gpu" not in intel_extension_for_transformers.__version__: + from intel_extension_for_transformers.qbits_py import * # pylint: disable=E0401, E0611 diff --git a/setup.py b/setup.py index 17700afeeb3..13aec7b7025 100644 --- a/setup.py +++ b/setup.py @@ -8,10 +8,12 @@ from pathlib import Path from setuptools import Extension, find_packages, setup from setuptools.command.build_ext import build_ext +from setuptools_scm import get_version result = subprocess.Popen("pip install -r requirements.txt", shell=True) result.wait() + def is_intel_gpu_available(): import torch import intel_extension_for_pytorch as ipex @@ -286,6 +288,9 @@ def check_submodules(): "intel_extension_for_transformers/transformers/runtime/"), ]) cmdclass = {'build_ext': CMakeBuild} + itrex_version = get_version() + if IS_INTEL_GPU: + itrex_version = itrex_version + "-gpu" setup( name="intel-extension-for-transformers", @@ -324,4 +329,5 @@ def check_submodules(): ], setup_requires=['setuptools_scm'], use_scm_version=True, + version=itrex_version ) From 63056ece9ad6430b956ac995b61f519fd2b7b4a8 Mon Sep 17 00:00:00 2001 From: "Wang, Chang" Date: Fri, 28 Jun 2024 17:54:14 +0800 Subject: [PATCH 5/9] Improve MPT series SQ (#1640) Signed-off-by: Wang, Chang --- .../transformers/modeling/modeling_auto.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py index 263e4784d92..a5be8cdc519 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py @@ -840,6 +840,12 @@ def forward(self, input: torch.Tensor) -> tuple[torch.Tensor, None]: or device_map == torch.device("cpu") ) and model.config.model_type == "chatglm": model = model.float() + if ( + not torch.cuda.is_available() + or device_map == "cpu" + or device_map == torch.device("cpu") + ) and model.config.model_type == "mpt": + model.config.architectures = ["MptForCausalLM"] model.eval() model_type = model.config.model_type.replace("_", "-") @@ -1077,6 +1083,7 @@ def calib_func(model): recipes=quantization_config.recipes, example_inputs=example_inputs, ) + model = quantization.fit( model, conf, From 76aa9b16ee27325bc2db7d206757342284ed9148 Mon Sep 17 00:00:00 2001 From: "Wang, Chang" Date: Fri, 28 Jun 2024 18:55:29 +0800 Subject: [PATCH 6/9] Fix Qwen neural speed (#1641) Signed-off-by: changwangss --- 
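The lm-eval pin below (repeated repo-wide in patch 9/9) keeps evaluation behavior reproducible, since the harness Python API has shifted between 0.4.x releases. A minimal sketch of a 0.4.2-style call, assuming the upstream simple_evaluate entry point; the checkpoint and task names are placeholders, not values taken from this repository:

    import lm_eval

    # Sketch only: evaluate a Hugging Face model on CPU with lm-eval 0.4.2.
    # "facebook/opt-125m" and "lambada_openai" are illustrative placeholders.
    results = lm_eval.simple_evaluate(
        model="hf",                                 # Hugging Face backend
        model_args="pretrained=facebook/opt-125m",  # placeholder checkpoint
        tasks=["lambada_openai"],                   # placeholder task
        batch_size=8,
    )
    print(results["results"])                       # per-task metrics dict
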
examples/huggingface/neural_speed/requirements.txt | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/huggingface/neural_speed/requirements.txt b/examples/huggingface/neural_speed/requirements.txt index 62d596fa100..3f7fca6d65d 100644 --- a/examples/huggingface/neural_speed/requirements.txt +++ b/examples/huggingface/neural_speed/requirements.txt @@ -1,12 +1,11 @@ intel_extension_for_transformers neural-speed -lm-eval +lm-eval==0.4.2 sentencepiece gguf --extra-index-url https://download.pytorch.org/whl/cpu torch==2.3.0+cpu transformers -intel_extension_for_pytorch==2.3.0 tiktoken transformers_stream_generator -zipfile38 \ No newline at end of file +zipfile38 From 816f475b1ec3f4d6c9c4328eb132b30af31c083f Mon Sep 17 00:00:00 2001 From: intellinjun <105184542+intellinjun@users.noreply.github.com> Date: Fri, 28 Jun 2024 18:56:06 +0800 Subject: [PATCH 7/9] fix neural engine error (#1642) Signed-off-by: intellinjun --- .../emotion/distilbert_base_uncased/requirements.txt | 3 ++- .../deployment/mrpc/bert_base/requirements.txt | 3 ++- .../deployment/mrpc/bert_base_cased/requirements.txt | 3 ++- .../deployment/mrpc/bert_base_cased/run_glue.py | 2 +- .../deployment/mrpc/bert_mini/requirements.txt | 3 ++- .../deployment/mrpc/distilbert_base_uncased/requirements.txt | 3 ++- .../deployment/mrpc/roberta_base/requirements.txt | 3 ++- .../deployment/sparse/bert_mini/requirements.txt | 3 ++- .../deployment/sparse/distilbert_base_uncased/requirements.txt | 3 ++- .../deployment/sst2/bert_mini/requirements.txt | 3 ++- .../deployment/sst2/distilbert_base_uncased/requirements.txt | 3 ++- .../deployment/sst2/minilm_l6_h384_uncased/requirements.txt | 3 ++- 12 files changed, 23 insertions(+), 12 deletions(-) diff --git a/examples/huggingface/pytorch/text-classification/deployment/emotion/distilbert_base_uncased/requirements.txt b/examples/huggingface/pytorch/text-classification/deployment/emotion/distilbert_base_uncased/requirements.txt index 6cf73c3deae..5d46e354048 100644 --- a/examples/huggingface/pytorch/text-classification/deployment/emotion/distilbert_base_uncased/requirements.txt +++ b/examples/huggingface/pytorch/text-classification/deployment/emotion/distilbert_base_uncased/requirements.txt @@ -1,4 +1,5 @@ -neural-compressor +intel-extension-for-transformers==1.4.2 +neural-compressor==2.6 transformers accelerate datasets >= 1.8.0 diff --git a/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base/requirements.txt b/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base/requirements.txt index 94b03297a4a..3e3e2e9d604 100644 --- a/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base/requirements.txt +++ b/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base/requirements.txt @@ -1,4 +1,5 @@ -neural-compressor +intel-extension-for-transformers==1.4.2 +neural-compressor==2.6 transformers accelerate datasets >= 1.8.0 diff --git a/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base_cased/requirements.txt b/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base_cased/requirements.txt index 94b03297a4a..3e3e2e9d604 100644 --- a/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base_cased/requirements.txt +++ b/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base_cased/requirements.txt @@ -1,4 +1,5 @@ -neural-compressor +intel-extension-for-transformers==1.4.2 +neural-compressor==2.6 transformers accelerate datasets >= 
1.8.0 diff --git a/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base_cased/run_glue.py b/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base_cased/run_glue.py index 9374620302a..f3645880317 100644 --- a/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base_cased/run_glue.py +++ b/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base_cased/run_glue.py @@ -468,7 +468,7 @@ def preprocess_function(examples): # Get the metric function if data_args.task_name is not None: - metric = load_metric("glue", data_args.task_name) + metric = load_metric("glue", data_args.task_name,trust_remote_code=True) else: metric = load_metric("accuracy") diff --git a/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_mini/requirements.txt b/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_mini/requirements.txt index 94b03297a4a..3e3e2e9d604 100644 --- a/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_mini/requirements.txt +++ b/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_mini/requirements.txt @@ -1,4 +1,5 @@ -neural-compressor +intel-extension-for-transformers==1.4.2 +neural-compressor==2.6 transformers accelerate datasets >= 1.8.0 diff --git a/examples/huggingface/pytorch/text-classification/deployment/mrpc/distilbert_base_uncased/requirements.txt b/examples/huggingface/pytorch/text-classification/deployment/mrpc/distilbert_base_uncased/requirements.txt index 94b03297a4a..3e3e2e9d604 100644 --- a/examples/huggingface/pytorch/text-classification/deployment/mrpc/distilbert_base_uncased/requirements.txt +++ b/examples/huggingface/pytorch/text-classification/deployment/mrpc/distilbert_base_uncased/requirements.txt @@ -1,4 +1,5 @@ -neural-compressor +intel-extension-for-transformers==1.4.2 +neural-compressor==2.6 transformers accelerate datasets >= 1.8.0 diff --git a/examples/huggingface/pytorch/text-classification/deployment/mrpc/roberta_base/requirements.txt b/examples/huggingface/pytorch/text-classification/deployment/mrpc/roberta_base/requirements.txt index 94b03297a4a..3e3e2e9d604 100644 --- a/examples/huggingface/pytorch/text-classification/deployment/mrpc/roberta_base/requirements.txt +++ b/examples/huggingface/pytorch/text-classification/deployment/mrpc/roberta_base/requirements.txt @@ -1,4 +1,5 @@ -neural-compressor +intel-extension-for-transformers==1.4.2 +neural-compressor==2.6 transformers accelerate datasets >= 1.8.0 diff --git a/examples/huggingface/pytorch/text-classification/deployment/sparse/bert_mini/requirements.txt b/examples/huggingface/pytorch/text-classification/deployment/sparse/bert_mini/requirements.txt index 94b03297a4a..3e3e2e9d604 100644 --- a/examples/huggingface/pytorch/text-classification/deployment/sparse/bert_mini/requirements.txt +++ b/examples/huggingface/pytorch/text-classification/deployment/sparse/bert_mini/requirements.txt @@ -1,4 +1,5 @@ -neural-compressor +intel-extension-for-transformers==1.4.2 +neural-compressor==2.6 transformers accelerate datasets >= 1.8.0 diff --git a/examples/huggingface/pytorch/text-classification/deployment/sparse/distilbert_base_uncased/requirements.txt b/examples/huggingface/pytorch/text-classification/deployment/sparse/distilbert_base_uncased/requirements.txt index 94b03297a4a..3e3e2e9d604 100644 --- a/examples/huggingface/pytorch/text-classification/deployment/sparse/distilbert_base_uncased/requirements.txt +++ 
b/examples/huggingface/pytorch/text-classification/deployment/sparse/distilbert_base_uncased/requirements.txt @@ -1,4 +1,5 @@ -neural-compressor +intel-extension-for-transformers==1.4.2 +neural-compressor==2.6 transformers accelerate datasets >= 1.8.0 diff --git a/examples/huggingface/pytorch/text-classification/deployment/sst2/bert_mini/requirements.txt b/examples/huggingface/pytorch/text-classification/deployment/sst2/bert_mini/requirements.txt index 94b03297a4a..3e3e2e9d604 100644 --- a/examples/huggingface/pytorch/text-classification/deployment/sst2/bert_mini/requirements.txt +++ b/examples/huggingface/pytorch/text-classification/deployment/sst2/bert_mini/requirements.txt @@ -1,4 +1,5 @@ -neural-compressor +intel-extension-for-transformers==1.4.2 +neural-compressor==2.6 transformers accelerate datasets >= 1.8.0 diff --git a/examples/huggingface/pytorch/text-classification/deployment/sst2/distilbert_base_uncased/requirements.txt b/examples/huggingface/pytorch/text-classification/deployment/sst2/distilbert_base_uncased/requirements.txt index 94b03297a4a..3e3e2e9d604 100644 --- a/examples/huggingface/pytorch/text-classification/deployment/sst2/distilbert_base_uncased/requirements.txt +++ b/examples/huggingface/pytorch/text-classification/deployment/sst2/distilbert_base_uncased/requirements.txt @@ -1,4 +1,5 @@ -neural-compressor +intel-extension-for-transformers==1.4.2 +neural-compressor==2.6 transformers accelerate datasets >= 1.8.0 diff --git a/examples/huggingface/pytorch/text-classification/deployment/sst2/minilm_l6_h384_uncased/requirements.txt b/examples/huggingface/pytorch/text-classification/deployment/sst2/minilm_l6_h384_uncased/requirements.txt index 94b03297a4a..3e3e2e9d604 100644 --- a/examples/huggingface/pytorch/text-classification/deployment/sst2/minilm_l6_h384_uncased/requirements.txt +++ b/examples/huggingface/pytorch/text-classification/deployment/sst2/minilm_l6_h384_uncased/requirements.txt @@ -1,4 +1,5 @@ -neural-compressor +intel-extension-for-transformers==1.4.2 +neural-compressor==2.6 transformers accelerate datasets >= 1.8.0 From 317b9133ccde44d82c99c4b312cf8e3e4afc925f Mon Sep 17 00:00:00 2001 From: "Wang, Chang" Date: Tue, 2 Jul 2024 13:13:46 +0800 Subject: [PATCH 8/9] Clean INC import (#1645) Signed-off-by: changwangss --- .../transformers/config.py | 2 +- .../transformers/utils/utility_tf.py | 107 ------------------ 2 files changed, 1 insertion(+), 108 deletions(-) delete mode 100644 intel_extension_for_transformers/transformers/utils/utility_tf.py diff --git a/intel_extension_for_transformers/transformers/config.py b/intel_extension_for_transformers/transformers/config.py index a0009e7d3ed..f5918267491 100644 --- a/intel_extension_for_transformers/transformers/config.py +++ b/intel_extension_for_transformers/transformers/config.py @@ -19,7 +19,7 @@ import yaml from enum import Enum -from neural_compressor.conf.dotdict import DotDict +from neural_compressor.utils.utility import DotDict from .utils.metrics import Metric from .utils.objectives import Objective, performance diff --git a/intel_extension_for_transformers/transformers/utils/utility_tf.py b/intel_extension_for_transformers/transformers/utils/utility_tf.py deleted file mode 100644 index f19785740af..00000000000 --- a/intel_extension_for_transformers/transformers/utils/utility_tf.py +++ /dev/null @@ -1,107 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (c) 2022 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file 
except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Utils for tensorflow framework.""" - -import os -import json -from collections import OrderedDict, UserDict -from neural_compressor.experimental import common - -TMPPATH = os.path.join('tmp', 'model') -TEACHERPATH = os.path.join('tmp', 'teacher_model') -class TFDataloader(object): - """Tensorflow dataloader. - - Args: - dataset (string): Dataset - """ - - def __init__(self, dataset, batch_size=None): - """Init an instance.""" - self.dataset = dataset - self.batch_size = batch_size - - def __iter__(self): - """Get the iteration of dataset.""" - for inputs, labels in self.dataset: - if isinstance(inputs, dict) or isinstance(inputs, OrderedDict) \ - or isinstance(inputs, UserDict): - for name in inputs.keys(): - inputs[name] = inputs[name].numpy() - elif isinstance(inputs, list) or isinstance(inputs, tuple): - inputs = [input.numpy() for input in inputs] - else: - inputs = inputs.numpy() - - if isinstance(labels, dict) or isinstance(labels, OrderedDict) \ - or isinstance(labels, UserDict): # pragma: no cover - for name in labels.keys(): - labels[name] = labels[name].numpy() - elif isinstance(labels, list) or isinstance(labels, tuple): - labels = [label.numpy() for label in labels] - else: - labels = labels.numpy() - yield inputs, labels - - def __len__(self): - """Return the length of dataset.""" - return len(self.dataset) - - -def distributed_init(worker_addresses, type='worker', index=0): - """Init distribute environment. - - Args: - worker_addresses: Addresses of all nodes. - type: The type of node, such as worker. - index: When index is 0, the node treat as a chief. - """ - tf_config = { - 'cluster': { - 'worker': worker_addresses - }, - 'task': {'type': type, 'index': index} - } - os.environ['TF_CONFIG'] = json.dumps(tf_config) - -def _is_chief(task_type, task_id): - # here only consider the case in which TF_CONFIG task_type is set as worker - # and task_id=0 represents the chief - return (task_type == 'worker' and task_id == 0) - -# get model folder path for the distributed environment -def get_filepath(base_dirpath, task_type, task_id): - """Get model folder path for the distributed environment. - - Args: - base_dirpath: The basic folder path. - task_type: Task_type is set as worker. - task_id: Task id. When task_id=0, the node treat as a chief. 
- """ - if task_type is None: # single node - return base_dirpath - elif _is_chief(task_type, task_id): - return os.path.join(base_dirpath, 'chief') - else: - return os.path.join(base_dirpath, 'worker_' + str(task_id)) - - -# convert a Keras model to SavedModel -def keras2SavedModel(model): # pragma: no cover - """Transfer keras model into save_model.""" - model = common.Model(model) - return model.model From 86087dc4a1d0ed74c1360c7906cd4eae9a59704e Mon Sep 17 00:00:00 2001 From: "Wang, Chang" Date: Tue, 2 Jul 2024 17:29:30 +0800 Subject: [PATCH 9/9] Set lm-eval to 0.4.2 (#1647) Signed-off-by: changwangss --- .github/workflows/script/formatScan/pylint.sh | 2 +- .../pytorch/language-modeling/inference/requirements.txt | 2 +- .../pytorch/language-modeling/pruning/requirements.txt | 2 +- .../pytorch/language-modeling/quantization/requirements.txt | 2 +- .../huggingface/pytorch/text2text-generation/requirements.txt | 2 +- examples/modelscope/requirements.txt | 2 +- .../neural_chat/requirements_cpu.txt | 2 +- .../neural_chat/requirements_hpu.txt | 2 +- .../neural_chat/requirements_win.txt | 2 +- .../neural_chat/tests/requirements.txt | 2 +- 10 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/workflows/script/formatScan/pylint.sh b/.github/workflows/script/formatScan/pylint.sh index eeb71beb604..41e22c470b9 100644 --- a/.github/workflows/script/formatScan/pylint.sh +++ b/.github/workflows/script/formatScan/pylint.sh @@ -28,7 +28,7 @@ else echo "Not found requirements.txt file." fi # install packages -pip install lm-eval +pip install lm-eval==0.4.2 pip install accelerate nlpaug nltk schema optimum-intel optimum peft pip install --upgrade --force-reinstall transformers==4.36.2 pip install optimum-habana diff --git a/examples/huggingface/pytorch/language-modeling/inference/requirements.txt b/examples/huggingface/pytorch/language-modeling/inference/requirements.txt index e87bc861ca8..cd6cd604899 100644 --- a/examples/huggingface/pytorch/language-modeling/inference/requirements.txt +++ b/examples/huggingface/pytorch/language-modeling/inference/requirements.txt @@ -1,4 +1,4 @@ transformers accelerate sentencepiece != 0.1.92 -lm-eval +lm-eval==0.4.2 diff --git a/examples/huggingface/pytorch/language-modeling/pruning/requirements.txt b/examples/huggingface/pytorch/language-modeling/pruning/requirements.txt index b60bac56d76..a1ea63132a8 100644 --- a/examples/huggingface/pytorch/language-modeling/pruning/requirements.txt +++ b/examples/huggingface/pytorch/language-modeling/pruning/requirements.txt @@ -7,5 +7,5 @@ transformers torch==2.0.1 tqdm neural_compressor -lm-eval +lm-eval==0.4.2 diff --git a/examples/huggingface/pytorch/language-modeling/quantization/requirements.txt b/examples/huggingface/pytorch/language-modeling/quantization/requirements.txt index c7b5b6fcf83..36ee5a1b55a 100644 --- a/examples/huggingface/pytorch/language-modeling/quantization/requirements.txt +++ b/examples/huggingface/pytorch/language-modeling/quantization/requirements.txt @@ -9,5 +9,5 @@ wandb einops neural-compressor pytest==8.0.0 -lm-eval +lm-eval==0.4.2 git+https://github.com/huggingface/peft.git@6c44096c7b8d55a2ecf24be9bc68393467e1584a diff --git a/examples/huggingface/pytorch/text2text-generation/requirements.txt b/examples/huggingface/pytorch/text2text-generation/requirements.txt index 8a585f9fd9e..73e4ae2e655 100644 --- a/examples/huggingface/pytorch/text2text-generation/requirements.txt +++ b/examples/huggingface/pytorch/text2text-generation/requirements.txt @@ -11,4 +11,4 @@ neural-compressor 
optimum-intel > 1.12.0 onnxruntime intel-extension-for-pytorch -lm-eval +lm-eval==0.4.2 diff --git a/examples/modelscope/requirements.txt b/examples/modelscope/requirements.txt index bc7a3e65de6..b04bd189db0 100644 --- a/examples/modelscope/requirements.txt +++ b/examples/modelscope/requirements.txt @@ -1,6 +1,6 @@ intel_extension_for_transformers neural-speed -lm-eval +lm-eval==0.4.2 sentencepiece gguf --extra-index-url https://download.pytorch.org/whl/cpu diff --git a/intel_extension_for_transformers/neural_chat/requirements_cpu.txt b/intel_extension_for_transformers/neural_chat/requirements_cpu.txt index 6097d2e2a0d..7b38113697b 100644 --- a/intel_extension_for_transformers/neural_chat/requirements_cpu.txt +++ b/intel_extension_for_transformers/neural_chat/requirements_cpu.txt @@ -7,7 +7,7 @@ fastapi fschat==0.2.32 huggingface_hub intel_extension_for_pytorch==2.3.0 -lm-eval +lm-eval==0.4.2 neural-compressor neural_speed==1.0a0 numpy==1.23.5 diff --git a/intel_extension_for_transformers/neural_chat/requirements_hpu.txt b/intel_extension_for_transformers/neural_chat/requirements_hpu.txt index 1c6dfa0d47a..f3983b6d3c5 100644 --- a/intel_extension_for_transformers/neural_chat/requirements_hpu.txt +++ b/intel_extension_for_transformers/neural_chat/requirements_hpu.txt @@ -4,7 +4,7 @@ evaluate fastapi fschat==0.2.35 huggingface_hub -lm-eval +lm-eval==0.4.2 neural-compressor numpy==1.23.5 optimum diff --git a/intel_extension_for_transformers/neural_chat/requirements_win.txt b/intel_extension_for_transformers/neural_chat/requirements_win.txt index c417c5ca01a..56ac6027ab4 100644 --- a/intel_extension_for_transformers/neural_chat/requirements_win.txt +++ b/intel_extension_for_transformers/neural_chat/requirements_win.txt @@ -6,7 +6,7 @@ fastapi fschat==0.2.35 huggingface_hub intel-extension-for-transformers -lm-eval +lm-eval==0.4.2 neural-compressor numpy==1.23.5 optimum diff --git a/intel_extension_for_transformers/neural_chat/tests/requirements.txt b/intel_extension_for_transformers/neural_chat/tests/requirements.txt index a4243865087..97a46d2e502 100644 --- a/intel_extension_for_transformers/neural_chat/tests/requirements.txt +++ b/intel_extension_for_transformers/neural_chat/tests/requirements.txt @@ -38,7 +38,7 @@ langchain-community==0.0.27 langchain_core==0.1.35 langid librosa -lm-eval +lm-eval==0.4.2 markdown neural-compressor neural_speed==1.0a0
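Whether a given environment actually resolved the pinned release can be checked with the standard library alone. This closing snippet is a generic sketch, not a script shipped in this repository:

    from importlib.metadata import PackageNotFoundError, version

    # Sketch only: confirm the pinned evaluation-harness release is installed.
    try:
        installed = version("lm-eval")  # pip distribution name (normalized lookup)
    except PackageNotFoundError:
        installed = None

    assert installed == "0.4.2", f"expected lm-eval 0.4.2, found {installed}"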