diff --git a/.github/workflows/script/formatScan/pylint.sh b/.github/workflows/script/formatScan/pylint.sh index eeb71beb604..41e22c470b9 100644 --- a/.github/workflows/script/formatScan/pylint.sh +++ b/.github/workflows/script/formatScan/pylint.sh @@ -28,7 +28,7 @@ else echo "Not found requirements.txt file." fi # install packages -pip install lm-eval +pip install lm-eval==0.4.2 pip install accelerate nlpaug nltk schema optimum-intel optimum peft pip install --upgrade --force-reinstall transformers==4.36.2 pip install optimum-habana diff --git a/.github/workflows/script/install_binary.sh b/.github/workflows/script/install_binary.sh index bbd6b7df2f1..7bca0d4d2f3 100644 --- a/.github/workflows/script/install_binary.sh +++ b/.github/workflows/script/install_binary.sh @@ -4,6 +4,7 @@ source /intel-extension-for-transformers/.github/workflows/script/change_color.s cd /intel-extension-for-transformers export CMAKE_ARGS="-DNE_DNNL_CACHE_DIR=/cache" pip install -U pip +pip install -r requirements.txt $BOLD_YELLOW && echo "---------------- git submodule update --init --recursive -------------" && $RESET git config --global --add safe.directory "*" git submodule update --init --recursive diff --git a/docs/api_doc/optimization/config.rst b/docs/api_doc/optimization/config.rst index 435f5d6ddad..2ca607c03dc 100644 --- a/docs/api_doc/optimization/config.rst +++ b/docs/api_doc/optimization/config.rst @@ -4,7 +4,4 @@ Config .. 
autoapisummary:: intel_extension_for_transformers.transformers.utils.metrics intel_extension_for_transformers.transformers.utils.objectives - intel_extension_for_transformers.transformers.config - intel_extension_for_transformers.transformers.quantization - intel_extension_for_transformers.transformers.distillation - intel_extension_for_transformers.transformers.pruning + intel_extension_for_transformers.transformers.utils.config diff --git a/docs/api_doc/optimization/tf_optimization.rst b/docs/api_doc/optimization/tf_optimization.rst deleted file mode 100644 index 3aa7cb7864a..00000000000 --- a/docs/api_doc/optimization/tf_optimization.rst +++ /dev/null @@ -1,6 +0,0 @@ -TensorFlow Optimizer -============== - -.. autoapisummary:: - - intel_extension_for_transformers.transformers.optimizer_tf diff --git a/docs/api_doc/user_api.rst b/docs/api_doc/user_api.rst index 712132f5d55..80a7ead6078 100644 --- a/docs/api_doc/user_api.rst +++ b/docs/api_doc/user_api.rst @@ -7,7 +7,5 @@ The following Python API information is available: :maxdepth: 1 optimization/trainer.rst - optimization/optimizer.rst optimization/model.rst - optimization/tf_optimization.rst optimization/config.rst diff --git a/examples/.config/pytorch_optimize.json b/examples/.config/pytorch_optimize.json index 1c1573ceb41..791dc367033 100644 --- a/examples/.config/pytorch_optimize.json +++ b/examples/.config/pytorch_optimize.json @@ -2450,6 +2450,58 @@ } } }, + "phi_2b_gen_ipex_static": { + "working_dir": "huggingface/pytorch/text-generation/quantization", + "tune": { + "cmd": "bash run_tuning.sh", + "params": { + "topology": "phi_2b", + "task": "generation", + "approach": "static", + "output_model": "saved_results" + } + }, + "benchmark": { + "cmd": "bash run_benchmark.sh", + "params": { + "topology": "phi_2b", + "task": "generation", + "approach": "static", + "backend": "ipex", + "mode": "benchmark", + "batch_size": "112", + "iters": "100", + "int8": "false", + "config": "saved_results" + } + } + }, + 
"phi_3b_gen_ipex_static": { + "working_dir": "huggingface/pytorch/text-generation/quantization", + "tune": { + "cmd": "bash run_tuning.sh", + "params": { + "topology": "phi_3b", + "task": "generation", + "approach": "static", + "output_model": "saved_results" + } + }, + "benchmark": { + "cmd": "bash run_benchmark.sh", + "params": { + "topology": "phi_3b", + "task": "generation", + "approach": "static", + "backend": "ipex", + "mode": "benchmark", + "batch_size": "112", + "iters": "100", + "int8": "false", + "config": "saved_results" + } + } + }, "flan-t5-large_gen_ipex_static": { "working_dir": "huggingface/pytorch/text2text-generation", "tune": { diff --git a/examples/.config/tensorflow_optimize.json b/examples/.config/tensorflow_optimize.json deleted file mode 100644 index ab0aacaf6ce..00000000000 --- a/examples/.config/tensorflow_optimize.json +++ /dev/null @@ -1,255 +0,0 @@ -{ - "bert_base_mrpc_static": { - "working_dir": "huggingface/tensorflow/text-classification/quantization/ptq", - "tune":{ - "cmd": "bash run_tuning.sh", - "params": { - "topology": "bert_base_mrpc_static", - "output_model": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/text-classification" - } - }, - "benchmark": { - "cmd": "bash run_benchmark.sh", - "params": { - "topology": "bert_base_mrpc_static", - "mode": "accuracy", - "batch_size": "64", - "iters": "100", - "int8": "false", - "config": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/text-classification" - } - } - }, - "distilgpt2_clm": { - "working_dir": "huggingface/tensorflow/language-modeling/quantization/ptq", - "tune":{ - "cmd": "bash run_tuning.sh", - "params": { - "topology": "distilgpt2_clm", - "output_model": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/language-modeling" - } - }, - "benchmark": { - "cmd": "bash run_benchmark.sh", - "params": { - "topology": "distilgpt2_clm", - "mode": "accuracy", - "batch_size": "16", - "iters": "100", - "int8": "false", - "config": "saved_results", - 
"cache_dir": "${HOME}/.cache/nlp_toolkit/language-modeling" - } - } - }, - "distilbert_mlm": { - "working_dir": "huggingface/tensorflow/language-modeling/quantization/ptq", - "tune":{ - "cmd": "bash run_tuning.sh", - "params": { - "topology": "distilbert_mlm", - "output_model": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/language-modeling" - } - }, - "benchmark": { - "cmd": "bash run_benchmark.sh", - "params": { - "topology": "distilbert_mlm", - "mode": "accuracy", - "batch_size": "16", - "iters": "100", - "int8": "false", - "config": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/language-modeling" - } - } - }, - "bert_base_ner": { - "working_dir": "huggingface/tensorflow/token-classification/quantization", - "tune":{ - "cmd": "bash run_tuning.sh", - "params": { - "topology": "bert_base_ner", - "output_model": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/token-classification" - } - }, - "benchmark": { - "cmd": "bash run_benchmark.sh", - "params": { - "topology": "bert_base_ner", - "mode": "accuracy", - "batch_size": "16", - "iters": "100", - "int8": "false", - "config": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/token-classification" - } - } - }, - "distilbert_qa": { - "working_dir": "huggingface/tensorflow/question-answering/quantization", - "tune":{ - "cmd": "bash run_tuning.sh", - "params": { - "topology": "distilbert_qa", - "output_model": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/question-answering" - } - }, - "benchmark": { - "cmd": "bash run_benchmark.sh", - "params": { - "topology": "distilbert_qa", - "mode": "accuracy", - "batch_size": "16", - "iters": "100", - "int8": "false", - "config": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/question-answering" - } - } - }, - "distilbert_swag": { - "working_dir": "huggingface/tensorflow/multiple-choice/quantization", - "tune":{ - "cmd": "bash run_tuning.sh", - "params": { - "topology": "distilbert_swag", - 
"output_model": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/multiple-choice" - } - }, - "benchmark": { - "cmd": "bash run_benchmark.sh", - "params": { - "topology": "distilbert_swag", - "mode": "accuracy", - "batch_size": "16", - "iters": "100", - "int8": "false", - "config": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/multiple-choice" - } - } - }, - "roberta_qa": { - "working_dir": "huggingface/tensorflow/question-answering/quantization", - "tune":{ - "cmd": "bash run_tuning.sh", - "params": { - "topology": "roberta_qa", - "output_model": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/question-answering" - } - }, - "benchmark": { - "cmd": "bash run_benchmark.sh", - "params": { - "topology": "roberta_qa", - "mode": "accuracy", - "batch_size": "16", - "iters": "100", - "int8": "false", - "config": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/question-answering" - } - } - }, - "distilroberta_mlm": { - "working_dir": "huggingface/tensorflow/language-modeling/quantization/ptq", - "tune":{ - "cmd": "bash run_tuning.sh", - "params": { - "topology": "distilroberta_mlm", - "output_model": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/language-modeling" - } - }, - "benchmark": { - "cmd": "bash run_benchmark.sh", - "params": { - "topology": "distilroberta_mlm", - "mode": "accuracy", - "batch_size": "16", - "iters": "100", - "int8": "false", - "config": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/language-modeling" - } - } - }, - "legalbert_mrpc": { - "working_dir": "huggingface/tensorflow/text-classification/quantization/ptq", - "tune":{ - "cmd": "bash run_tuning.sh", - "params": { - "topology": "legalbert_mrpc", - "output_model": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/text-classification" - } - }, - "benchmark": { - "cmd": "bash run_benchmark.sh", - "params": { - "topology": "legalbert_mrpc", - "mode": "accuracy", - "batch_size": "16", - "iters": "100", - "int8": 
"false", - "config": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/text-classification" - } - } - }, - "xlnet_mrpc": { - "working_dir": "huggingface/tensorflow/text-classification/quantization/ptq", - "tune":{ - "cmd": "bash run_tuning.sh", - "params": { - "topology": "xlnet_mrpc", - "output_model": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/text-classification" - } - }, - "benchmark": { - "cmd": "bash run_benchmark.sh", - "params": { - "topology": "xlnet_mrpc", - "mode": "accuracy", - "batch_size": "16", - "iters": "100", - "int8": "false", - "config": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/text-classification" - } - } - }, - "albert_large_mrpc": { - "working_dir": "huggingface/tensorflow/text-classification/quantization/ptq", - "tune":{ - "cmd": "bash run_tuning.sh", - "params": { - "topology": "albert_large_mrpc", - "output_model": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/text-classification" - } - }, - "benchmark": { - "cmd": "bash run_benchmark.sh", - "params": { - "topology": "albert_large_mrpc", - "mode": "accuracy", - "batch_size": "16", - "iters": "100", - "int8": "false", - "config": "saved_results", - "cache_dir": "${HOME}/.cache/nlp_toolkit/text-classification" - } - } - } -} diff --git a/examples/huggingface/neural_speed/requirements.txt b/examples/huggingface/neural_speed/requirements.txt index 62d596fa100..3f7fca6d65d 100644 --- a/examples/huggingface/neural_speed/requirements.txt +++ b/examples/huggingface/neural_speed/requirements.txt @@ -1,12 +1,11 @@ intel_extension_for_transformers neural-speed -lm-eval +lm-eval==0.4.2 sentencepiece gguf --extra-index-url https://download.pytorch.org/whl/cpu torch==2.3.0+cpu transformers -intel_extension_for_pytorch==2.3.0 tiktoken transformers_stream_generator -zipfile38 \ No newline at end of file +zipfile38 diff --git a/examples/huggingface/pytorch/language-modeling/inference/requirements.txt 
b/examples/huggingface/pytorch/language-modeling/inference/requirements.txt index e87bc861ca8..cd6cd604899 100644 --- a/examples/huggingface/pytorch/language-modeling/inference/requirements.txt +++ b/examples/huggingface/pytorch/language-modeling/inference/requirements.txt @@ -1,4 +1,4 @@ transformers accelerate sentencepiece != 0.1.92 -lm-eval +lm-eval==0.4.2 diff --git a/examples/huggingface/pytorch/language-modeling/pruning/requirements.txt b/examples/huggingface/pytorch/language-modeling/pruning/requirements.txt index b60bac56d76..a1ea63132a8 100644 --- a/examples/huggingface/pytorch/language-modeling/pruning/requirements.txt +++ b/examples/huggingface/pytorch/language-modeling/pruning/requirements.txt @@ -7,5 +7,5 @@ transformers torch==2.0.1 tqdm neural_compressor -lm-eval +lm-eval==0.4.2 diff --git a/examples/huggingface/pytorch/language-modeling/quantization/requirements.txt b/examples/huggingface/pytorch/language-modeling/quantization/requirements.txt index c7b5b6fcf83..36ee5a1b55a 100644 --- a/examples/huggingface/pytorch/language-modeling/quantization/requirements.txt +++ b/examples/huggingface/pytorch/language-modeling/quantization/requirements.txt @@ -9,5 +9,5 @@ wandb einops neural-compressor pytest==8.0.0 -lm-eval +lm-eval==0.4.2 git+https://github.com/huggingface/peft.git@6c44096c7b8d55a2ecf24be9bc68393467e1584a diff --git a/examples/huggingface/pytorch/text-classification/deployment/emotion/distilbert_base_uncased/requirements.txt b/examples/huggingface/pytorch/text-classification/deployment/emotion/distilbert_base_uncased/requirements.txt index 6cf73c3deae..5d46e354048 100644 --- a/examples/huggingface/pytorch/text-classification/deployment/emotion/distilbert_base_uncased/requirements.txt +++ b/examples/huggingface/pytorch/text-classification/deployment/emotion/distilbert_base_uncased/requirements.txt @@ -1,4 +1,5 @@ -neural-compressor +intel-extension-for-transformers==1.4.2 +neural-compressor==2.6 transformers accelerate datasets >= 1.8.0 diff 
--git a/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base/requirements.txt b/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base/requirements.txt index 94b03297a4a..3e3e2e9d604 100644 --- a/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base/requirements.txt +++ b/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base/requirements.txt @@ -1,4 +1,5 @@ -neural-compressor +intel-extension-for-transformers==1.4.2 +neural-compressor==2.6 transformers accelerate datasets >= 1.8.0 diff --git a/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base_cased/requirements.txt b/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base_cased/requirements.txt index 94b03297a4a..3e3e2e9d604 100644 --- a/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base_cased/requirements.txt +++ b/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base_cased/requirements.txt @@ -1,4 +1,5 @@ -neural-compressor +intel-extension-for-transformers==1.4.2 +neural-compressor==2.6 transformers accelerate datasets >= 1.8.0 diff --git a/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base_cased/run_glue.py b/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base_cased/run_glue.py index 9374620302a..f3645880317 100644 --- a/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base_cased/run_glue.py +++ b/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base_cased/run_glue.py @@ -468,7 +468,7 @@ def preprocess_function(examples): # Get the metric function if data_args.task_name is not None: - metric = load_metric("glue", data_args.task_name) + metric = load_metric("glue", data_args.task_name,trust_remote_code=True) else: metric = load_metric("accuracy") diff --git a/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_mini/requirements.txt 
b/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_mini/requirements.txt index 94b03297a4a..3e3e2e9d604 100644 --- a/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_mini/requirements.txt +++ b/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_mini/requirements.txt @@ -1,4 +1,5 @@ -neural-compressor +intel-extension-for-transformers==1.4.2 +neural-compressor==2.6 transformers accelerate datasets >= 1.8.0 diff --git a/examples/huggingface/pytorch/text-classification/deployment/mrpc/distilbert_base_uncased/requirements.txt b/examples/huggingface/pytorch/text-classification/deployment/mrpc/distilbert_base_uncased/requirements.txt index 94b03297a4a..3e3e2e9d604 100644 --- a/examples/huggingface/pytorch/text-classification/deployment/mrpc/distilbert_base_uncased/requirements.txt +++ b/examples/huggingface/pytorch/text-classification/deployment/mrpc/distilbert_base_uncased/requirements.txt @@ -1,4 +1,5 @@ -neural-compressor +intel-extension-for-transformers==1.4.2 +neural-compressor==2.6 transformers accelerate datasets >= 1.8.0 diff --git a/examples/huggingface/pytorch/text-classification/deployment/mrpc/roberta_base/requirements.txt b/examples/huggingface/pytorch/text-classification/deployment/mrpc/roberta_base/requirements.txt index 94b03297a4a..3e3e2e9d604 100644 --- a/examples/huggingface/pytorch/text-classification/deployment/mrpc/roberta_base/requirements.txt +++ b/examples/huggingface/pytorch/text-classification/deployment/mrpc/roberta_base/requirements.txt @@ -1,4 +1,5 @@ -neural-compressor +intel-extension-for-transformers==1.4.2 +neural-compressor==2.6 transformers accelerate datasets >= 1.8.0 diff --git a/examples/huggingface/pytorch/text-classification/deployment/sparse/bert_mini/requirements.txt b/examples/huggingface/pytorch/text-classification/deployment/sparse/bert_mini/requirements.txt index 94b03297a4a..3e3e2e9d604 100644 --- 
a/examples/huggingface/pytorch/text-classification/deployment/sparse/bert_mini/requirements.txt +++ b/examples/huggingface/pytorch/text-classification/deployment/sparse/bert_mini/requirements.txt @@ -1,4 +1,5 @@ -neural-compressor +intel-extension-for-transformers==1.4.2 +neural-compressor==2.6 transformers accelerate datasets >= 1.8.0 diff --git a/examples/huggingface/pytorch/text-classification/deployment/sparse/distilbert_base_uncased/requirements.txt b/examples/huggingface/pytorch/text-classification/deployment/sparse/distilbert_base_uncased/requirements.txt index 94b03297a4a..3e3e2e9d604 100644 --- a/examples/huggingface/pytorch/text-classification/deployment/sparse/distilbert_base_uncased/requirements.txt +++ b/examples/huggingface/pytorch/text-classification/deployment/sparse/distilbert_base_uncased/requirements.txt @@ -1,4 +1,5 @@ -neural-compressor +intel-extension-for-transformers==1.4.2 +neural-compressor==2.6 transformers accelerate datasets >= 1.8.0 diff --git a/examples/huggingface/pytorch/text-classification/deployment/sst2/bert_mini/requirements.txt b/examples/huggingface/pytorch/text-classification/deployment/sst2/bert_mini/requirements.txt index 94b03297a4a..3e3e2e9d604 100644 --- a/examples/huggingface/pytorch/text-classification/deployment/sst2/bert_mini/requirements.txt +++ b/examples/huggingface/pytorch/text-classification/deployment/sst2/bert_mini/requirements.txt @@ -1,4 +1,5 @@ -neural-compressor +intel-extension-for-transformers==1.4.2 +neural-compressor==2.6 transformers accelerate datasets >= 1.8.0 diff --git a/examples/huggingface/pytorch/text-classification/deployment/sst2/distilbert_base_uncased/requirements.txt b/examples/huggingface/pytorch/text-classification/deployment/sst2/distilbert_base_uncased/requirements.txt index 94b03297a4a..3e3e2e9d604 100644 --- a/examples/huggingface/pytorch/text-classification/deployment/sst2/distilbert_base_uncased/requirements.txt +++ 
b/examples/huggingface/pytorch/text-classification/deployment/sst2/distilbert_base_uncased/requirements.txt @@ -1,4 +1,5 @@ -neural-compressor +intel-extension-for-transformers==1.4.2 +neural-compressor==2.6 transformers accelerate datasets >= 1.8.0 diff --git a/examples/huggingface/pytorch/text-classification/deployment/sst2/minilm_l6_h384_uncased/requirements.txt b/examples/huggingface/pytorch/text-classification/deployment/sst2/minilm_l6_h384_uncased/requirements.txt index 94b03297a4a..3e3e2e9d604 100644 --- a/examples/huggingface/pytorch/text-classification/deployment/sst2/minilm_l6_h384_uncased/requirements.txt +++ b/examples/huggingface/pytorch/text-classification/deployment/sst2/minilm_l6_h384_uncased/requirements.txt @@ -1,4 +1,5 @@ -neural-compressor +intel-extension-for-transformers==1.4.2 +neural-compressor==2.6 transformers accelerate datasets >= 1.8.0 diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh b/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh index bf77c9ece9a..1ed8c54b1ce 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh +++ b/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh @@ -168,13 +168,18 @@ function run_benchmark { model_name_or_path="Intel/neural-chat-7b-v3" script="run_generation_sq.py" elif [ "${topology}" = "phi_1b" ]; then - model_name_or_path="susnato/phi-1_dev" - pip install transformers==4.36.1 + model_name_or_path="microsoft/phi-1" script="run_generation_sq.py" elif [ "${topology}" = "phi_1_5b" ]; then - model_name_or_path="susnato/phi-1_5_dev" - pip install transformers==4.36.1 + model_name_or_path="microsoft/phi-1_5" script="run_generation_sq.py" + elif [ "${topology}" = "phi_2b" ]; then + model_name_or_path="microsoft/phi-2" + script="run_generation_sq.py" + elif [ "${topology}" = "phi_3b" ]; then + model_name_or_path="microsoft/Phi-3-mini-4k-instruct" + script="run_generation_sq.py" + 
extra_cmd=$extra_cmd" --trust_remote_code" elif [ "${topology}" = "llama2_7b_gptq" ] && [ "$model_source" != "huggingface" ]; then model_name_or_path="/tf_dataset2/models/nlp_toolkit/llama-2-7b-chat/Llama-2-7b-chat-hf" script="run_generation_cpu_woq.py" diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh b/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh index db3a06767c9..c89f84e8f59 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh +++ b/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh @@ -216,14 +216,25 @@ function run_tuning { script="run_generation_sq.py" elif [ "${topology}" = "phi_1b" ]; then alpha=0.5 - model_name_or_path="susnato/phi-1_dev" + model_name_or_path="microsoft/phi-1" extra_cmd=$extra_cmd" --sq --alpha ${alpha}" extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}" - extra_cmd=$extra_cmd" --trust_remote_code" script="run_generation_sq.py" elif [ "${topology}" = "phi_1_5b" ]; then alpha=0.5 - model_name_or_path="susnato/phi-1_5_dev" + model_name_or_path="microsoft/phi-1_5" + extra_cmd=$extra_cmd" --sq --alpha ${alpha}" + extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}" + script="run_generation_sq.py" + elif [ "${topology}" = "phi_2b" ]; then + alpha=0.5 + model_name_or_path="microsoft/phi-2" + extra_cmd=$extra_cmd" --sq --alpha ${alpha}" + extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}" + script="run_generation_sq.py" + elif [ "${topology}" = "phi_3b" ]; then + alpha=0.5 + model_name_or_path="microsoft/Phi-3-mini-4k-instruct" extra_cmd=$extra_cmd" --sq --alpha ${alpha}" extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}" extra_cmd=$extra_cmd" --trust_remote_code" diff --git a/examples/huggingface/pytorch/text2text-generation/requirements.txt b/examples/huggingface/pytorch/text2text-generation/requirements.txt index 8a585f9fd9e..73e4ae2e655 100644 --- 
a/examples/huggingface/pytorch/text2text-generation/requirements.txt +++ b/examples/huggingface/pytorch/text2text-generation/requirements.txt @@ -11,4 +11,4 @@ neural-compressor optimum-intel > 1.12.0 onnxruntime intel-extension-for-pytorch -lm-eval +lm-eval==0.4.2 diff --git a/examples/modelscope/requirements.txt b/examples/modelscope/requirements.txt index bc7a3e65de6..b04bd189db0 100644 --- a/examples/modelscope/requirements.txt +++ b/examples/modelscope/requirements.txt @@ -1,6 +1,6 @@ intel_extension_for_transformers neural-speed -lm-eval +lm-eval==0.4.2 sentencepiece gguf --extra-index-url https://download.pytorch.org/whl/cpu diff --git a/intel_extension_for_transformers/neural_chat/requirements_cpu.txt b/intel_extension_for_transformers/neural_chat/requirements_cpu.txt index 6097d2e2a0d..7b38113697b 100644 --- a/intel_extension_for_transformers/neural_chat/requirements_cpu.txt +++ b/intel_extension_for_transformers/neural_chat/requirements_cpu.txt @@ -7,7 +7,7 @@ fastapi fschat==0.2.32 huggingface_hub intel_extension_for_pytorch==2.3.0 -lm-eval +lm-eval==0.4.2 neural-compressor neural_speed==1.0a0 numpy==1.23.5 diff --git a/intel_extension_for_transformers/neural_chat/requirements_hpu.txt b/intel_extension_for_transformers/neural_chat/requirements_hpu.txt index 1c6dfa0d47a..f3983b6d3c5 100644 --- a/intel_extension_for_transformers/neural_chat/requirements_hpu.txt +++ b/intel_extension_for_transformers/neural_chat/requirements_hpu.txt @@ -4,7 +4,7 @@ evaluate fastapi fschat==0.2.35 huggingface_hub -lm-eval +lm-eval==0.4.2 neural-compressor numpy==1.23.5 optimum diff --git a/intel_extension_for_transformers/neural_chat/requirements_win.txt b/intel_extension_for_transformers/neural_chat/requirements_win.txt index c417c5ca01a..56ac6027ab4 100644 --- a/intel_extension_for_transformers/neural_chat/requirements_win.txt +++ b/intel_extension_for_transformers/neural_chat/requirements_win.txt @@ -6,7 +6,7 @@ fastapi fschat==0.2.35 huggingface_hub 
intel-extension-for-transformers -lm-eval +lm-eval==0.4.2 neural-compressor numpy==1.23.5 optimum diff --git a/intel_extension_for_transformers/neural_chat/tests/requirements.txt b/intel_extension_for_transformers/neural_chat/tests/requirements.txt index a4243865087..97a46d2e502 100644 --- a/intel_extension_for_transformers/neural_chat/tests/requirements.txt +++ b/intel_extension_for_transformers/neural_chat/tests/requirements.txt @@ -38,7 +38,7 @@ langchain-community==0.0.27 langchain_core==0.1.35 langid librosa -lm-eval +lm-eval==0.4.2 markdown neural-compressor neural_speed==1.0a0 diff --git a/intel_extension_for_transformers/qbits/__init__.py b/intel_extension_for_transformers/qbits/__init__.py index c23599090dc..5cd39d26a7f 100644 --- a/intel_extension_for_transformers/qbits/__init__.py +++ b/intel_extension_for_transformers/qbits/__init__.py @@ -16,5 +16,6 @@ # limitations under the License. import torch -if not torch.xpu._is_compiled(): - from intel_extension_for_transformers.qbits_py import * # pylint: disable=E0401, E0611 +import intel_extension_for_transformers +if "gpu" not in intel_extension_for_transformers.__version__: + from intel_extension_for_transformers.qbits_py import * # pylint: disable=E0401, E0611 diff --git a/intel_extension_for_transformers/transformers/config.py b/intel_extension_for_transformers/transformers/config.py index a0009e7d3ed..f5918267491 100644 --- a/intel_extension_for_transformers/transformers/config.py +++ b/intel_extension_for_transformers/transformers/config.py @@ -19,7 +19,7 @@ import yaml from enum import Enum -from neural_compressor.conf.dotdict import DotDict +from neural_compressor.utils.utility import DotDict from .utils.metrics import Metric from .utils.objectives import Objective, performance diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py index 7092d85452f..f90203d5ee4 100644 --- 
a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py @@ -935,9 +935,263 @@ def forward(self, input: torch.Tensor) -> tuple[torch.Tensor, None]: or device_map == torch.device("cpu") ) and model.config.model_type == "chatglm": model = model.float() + if ( + not torch.cuda.is_available() + or device_map == "cpu" + or device_map == torch.device("cpu") + ) and model.config.model_type == "mpt": + model.config.architectures = ["MptForCausalLM"] model.eval() logger.info("Applying SmoothQuant.") +<<<<<<< HEAD model = convert_to_smoothquant_model(model, quantization_config) +======= + # ipex.optimize_transformers + if quantization_config.ipex_opt_llm is None: + if model_type in IPEX_OPT_LLM_SUPPORTED: + quantization_config.ipex_opt_llm = True + logger.info( + "quantization_config.ipex_opt_llm set to True and ipex.optimize_transformers is used." + ) + logger.warning("The suggested transformers version is 4.38.1.") + else: + quantization_config.ipex_opt_llm = False + if quantization_config.ipex_opt_llm: + qconfig = ipex.quantization.get_smooth_quant_qconfig_mapping(alpha=0.5) + model = ipex.optimize_transformers( + model.eval(), + quantization_config=qconfig, + dtype=torch.float32, + inplace=True, + deployment_mode=False, + ) + model.eval() + + # past_key_values + num_beams = quantization_config.num_beams + if quantization_config.ipex_opt_llm: + past_key_values = generate_dummy_past_key_values_for_opt_llm( + config=model.config, input_bs=1, num_beams=num_beams + ) + else: + past_key_values = generate_dummy_past_key_values( + config=model.config, input_bs=1 + ) + + # calibration function + calib_func = quantization_config.calib_func + tokenizer = quantization_config.tokenizer + if calib_func is None: + if quantization_config.tokenizer is None: + logger.error( + "Please provide the tokenizer or provide calib_func directly," + + " the following is how to get tokenizer. 
\n" + + " from transformer import AutoTokenizer \n" + + " tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) \n" + ) + exit(0) + + from datasets import load_dataset + from torch.utils.data import DataLoader + + calib_dataset = quantization_config.calib_dataset + calib_shuffle = quantization_config.calib_shuffle + calib_iters = quantization_config.calib_iters + calib_padding = quantization_config.calib_padding + calib_len = quantization_config.calib_len + calib_pad_val = quantization_config.calib_pad_val + from torch.nn.functional import pad + + calib_dataset = load_dataset( + calib_dataset, + split=( + "test" + if calib_dataset in ["mbpp", "openai_humaneval"] + else "train" + ), + ) + if calib_shuffle: + calib_dataset = calib_dataset.shuffle(seed=42) + + def tokenize_function(examples): + if "code" in examples: + example = tokenizer(examples["code"]) + elif "prompt" in examples: + example = tokenizer(examples["prompt"]) + elif "text" in examples: + example = tokenizer(examples["text"]) + else: + logger.error( + "Please check dataset prompt identifier," + + " NeelNanda/pile-10k is default used calibration dataset." 
+ ) + exit(0) + return example + + def collate_batch(batch): + position_ids_padded = [] + input_ids_padded = [] + last_ind = [] + attention_mask_padded = [] + for text in batch: + input_ids = text["input_ids"] + if not calib_padding: + input_ids = ( + input_ids[: int(calib_len)] + if len(input_ids) > int(calib_len) + else input_ids + ) # no_padding + else: + pad_len = calib_len - input_ids.shape[0] + input_ids = pad( + input_ids, (0, pad_len), value=calib_pad_val + ) + + last_ind.append(input_ids.shape[0] - 1) + attention_mask = torch.ones(len(input_ids)) + position_ids = torch.arange(len(input_ids)) + input_ids_padded.append(input_ids) + attention_mask_padded.append(attention_mask) + position_ids_padded.append(position_ids) + if model_type in MODEL_TYPES_REQUIRING_POSITION_IDS: + return ( + { + "input_ids": torch.vstack(input_ids_padded), + "attention_mask": torch.vstack(attention_mask_padded), + "position_ids": torch.vstack(position_ids_padded), + "past_key_values": past_key_values, + }, + torch.tensor(last_ind), + ) + else: + return ( + { + "input_ids": torch.vstack(input_ids_padded), + "attention_mask": torch.vstack(attention_mask_padded), + "past_key_values": past_key_values, + }, + torch.tensor(last_ind), + ) + + def collate_batch_for_chatglm(batch): + last_ind = [] + for text in batch: + input_ids = torch.vstack([text["input_ids"]]) + if re.search( + "THUDM/chatglm-6b", model.config.auto_map["AutoConfig"] + ): + input_ids = ( + input_ids[:, :calib_len] + if input_ids.shape[1] > calib_len + else input_ids + ) + eos = torch.tensor([130001, 130004]).repeat(1, 1) + input_ids = torch.cat((input_ids, eos), 1) + else: + input_ids = ( + input_ids[:, :calib_len] + if input_ids.shape[1] > calib_len + else input_ids + ) + prepared_inputs = model.prepare_inputs_for_generation(input_ids) + attention_mask = torch.ones_like(input_ids) + last_ind.append(input_ids.shape[1] - 1) + return ( + { + "input_ids": input_ids, + "attention_mask": attention_mask, + "position_ids": 
prepared_inputs["position_ids"], + "past_key_values": past_key_values, + }, + torch.tensor(last_ind), + ) + + tokenized_dataset = calib_dataset.map(tokenize_function, batched=True) + tokenized_dataset.set_format(type="torch", columns=["input_ids"]) + if model_type == "chatglm": + calib_dataloader = DataLoader( + tokenized_dataset, + batch_size=1, + shuffle=False, + collate_fn=collate_batch_for_chatglm, + ) + else: + calib_dataloader = DataLoader( + tokenized_dataset, + batch_size=1, + shuffle=False, + collate_fn=collate_batch, + ) + + def calib_func(model): + with torch.no_grad(): + for i, (inputs, last_ind) in enumerate(calib_dataloader): + if i >= calib_iters: + break + if model_type in MODEL_TYPES_REQUIRING_POSITION_IDS: + model( + input_ids=inputs["input_ids"], + past_key_values=inputs["past_key_values"], + position_ids=inputs["position_ids"], + attention_mask=inputs["attention_mask"], + ) + else: + model( + input_ids=inputs["input_ids"], + past_key_values=inputs["past_key_values"], + attention_mask=inputs["attention_mask"], + ) + + logger.info( + "The default calibration function is used, " + + "the calibration dataset is NeelNanda/pile-10k, " + + "batchsize is 1 and calibration iteration is 100." 
+ ) + calib_func = calib_func + + # example_inputs + example_inputs = quantization_config.example_inputs + if example_inputs is None: + for i, (inputs, last_ind) in enumerate(calib_dataloader): + if model_type in MODEL_TYPES_REQUIRING_POSITION_IDS: + example_inputs = { + "input_ids": inputs["input_ids"], + "attention_mask": inputs["attention_mask"], + "position_ids": inputs["position_ids"], + "past_key_values": inputs["past_key_values"], + } + else: + example_inputs = { + "input_ids": inputs["input_ids"], + "attention_mask": inputs["attention_mask"], + "past_key_values": inputs["past_key_values"], + } + break + + # call inc sq + from neural_compressor import PostTrainingQuantConfig, quantization + + conf = PostTrainingQuantConfig( + backend=quantization_config.backend, # default is ipex + excluded_precisions=quantization_config.excluded_precisions, + op_type_dict=quantization_config.op_type_dict, + op_name_dict=quantization_config.op_name_dict, + recipes=quantization_config.recipes, + example_inputs=example_inputs, + ) + + model = quantization.fit( + model, + conf, + calib_func=calib_func, + calib_dataloader=( + calib_dataloader + if quantization_config.recipes["smooth_quant_args"]["alpha"] + == "auto" + else None + ), + ) logger.info("SmoothQuant done.") elif isinstance(quantization_config, DynamicQuantConfig): model = cls.ORIG_MODEL.from_pretrained( diff --git a/intel_extension_for_transformers/transformers/utils/utility.py b/intel_extension_for_transformers/transformers/utils/utility.py index 527d8b097ff..5f353b296ed 100644 --- a/intel_extension_for_transformers/transformers/utils/utility.py +++ b/intel_extension_for_transformers/transformers/utils/utility.py @@ -95,3 +95,410 @@ def __init__(self) -> None: self.dataset = dataloader.dataset return INCDataLoader() + + +def generate_dummy_past_key_values(config, input_bs): + """Generate the dummy past_key_values.""" + from optimum.utils import NormalizedConfigManager + if 
config.model_type == "qwen": + new_shape = [ + input_bs, + 0, + config.num_attention_heads, + config.hidden_size // config.num_attention_heads, + ] + num_layers = config.num_hidden_layers + elif config.model_type == "baichuan": + new_shape = [ + input_bs, + config.num_attention_heads, + 0, + config.hidden_size // config.num_attention_heads, + ] + num_layers = config.num_hidden_layers + elif config.model_type == "chatglm": + new_shape = [ + 0, + input_bs, + config.num_attention_heads, + config.hidden_size // config.num_attention_heads, + ] + num_layers = config.num_layers + else: + normalized_config = NormalizedConfigManager.get_normalized_config_class( + config.model_type + )(config) + nb_pkv = 2 + num_layers = normalized_config.num_layers + num_attention_heads = normalized_config.num_attention_heads + hidden_size = normalized_config.hidden_size + d_k = hidden_size // num_attention_heads + num_key_value_heads = num_attention_heads + if hasattr(normalized_config, "num_key_value_heads"): + num_key_value_heads = normalized_config.num_key_value_heads + if hasattr(normalized_config, "multi_query_group_num"): + num_key_value_heads = normalized_config.multi_query_group_num + + if config.model_type == "bloom": + shape_key = (input_bs * num_attention_heads, d_k, 1) + shape_value = (input_bs * num_attention_heads, 1, d_k) + key = torch.ones(size=shape_key) + value = torch.ones(size=shape_value) + past_key_values = tuple( + tuple(key if idx % 2 == 0 else value for idx in range(nb_pkv)) + for _ in range(num_layers) + ) + return past_key_values + elif config.model_type == "gpt_bigcode": + new_shape = [input_bs, 0, d_k * 2] + dummy_tensor = torch.zeros(size=new_shape) + past_key_values = tuple([dummy_tensor] * num_layers) + return past_key_values + elif config.model_type == "falcon": + new_shape = [input_bs, 1, 0, d_k] + else: + new_shape = [input_bs, num_key_value_heads, 0, d_k] + past_key_values = [ + ( + torch.zeros(size=new_shape).contiguous(), + 
torch.zeros(size=new_shape).contiguous(), + ) + for _ in range(num_layers) + ] + return tuple(past_key_values) + +def generate_dummy_past_key_values_for_inference(config, input_bs): + """Generate the dummy past_key_values.""" + from optimum.utils import NormalizedConfigManager + if config.model_type == "qwen": + new_shape = [ + input_bs, + 0, + config.num_attention_heads, + config.hidden_size // config.num_attention_heads, + ] + num_layers = config.num_hidden_layers + elif config.model_type == "baichuan": + new_shape = [ + input_bs, + config.num_attention_heads, + 0, + config.hidden_size // config.num_attention_heads, + ] + num_layers = config.num_hidden_layers + elif config.model_type == "chatglm": + new_shape = [ + 0, + input_bs, + config.num_attention_heads, + config.hidden_size // config.num_attention_heads, + ] + num_layers = config.num_layers + else: + normalized_config = NormalizedConfigManager.get_normalized_config_class( + config.model_type + )(config) + nb_pkv = 2 + num_layers = normalized_config.num_layers + num_attention_heads = normalized_config.num_attention_heads + hidden_size = normalized_config.hidden_size + d_k = hidden_size // num_attention_heads + num_key_value_heads = num_attention_heads + if hasattr(normalized_config, "num_key_value_heads"): + num_key_value_heads = normalized_config.num_key_value_heads + if hasattr(normalized_config, "multi_query_group_num"): + num_key_value_heads = normalized_config.multi_query_group_num + + if config.model_type == "bloom": + shape_key = (input_bs * num_attention_heads, d_k, 0) + shape_value = (input_bs * num_attention_heads, 0, d_k) + key = torch.empty(size=shape_key) + value = torch.empty(size=shape_value) + past_key_values = tuple( + tuple(key if idx % 2 == 0 else value for idx in range(nb_pkv)) + for _ in range(num_layers) + ) + return past_key_values + elif config.model_type == "gpt_bigcode": + new_shape = [input_bs, 0, d_k * 2] + dummy_tensor = torch.zeros(size=new_shape) + past_key_values = 
tuple([dummy_tensor] * num_layers) + return past_key_values + elif config.model_type == "falcon": + new_shape = [input_bs, 1, 0, d_k] + else: + new_shape = [input_bs, num_key_value_heads, 0, d_k] + past_key_values = [ + ( + torch.zeros(size=new_shape).contiguous(), + torch.zeros(size=new_shape).contiguous(), + ) + for _ in range(num_layers) + ] + return tuple(past_key_values) + +def generate_dummy_past_key_values_for_opt_llm(config, input_bs, num_beams=1): + """Generate the dummy past_key_values.""" + from optimum.utils import NormalizedConfigManager + if config.model_type == "qwen": + new_shape = [ + input_bs, + 1, + config.num_attention_heads, + config.hidden_size // config.num_attention_heads, + ] + num_layers = config.num_hidden_layers + elif config.model_type == "baichuan": + new_shape = [ + input_bs, + config.num_attention_heads, + 1, + config.hidden_size // config.num_attention_heads, + ] + num_layers = config.num_hidden_layers + elif config.model_type == "chatglm": + new_shape = [ + 1, + input_bs, + config.num_attention_heads, + config.hidden_size // config.num_attention_heads, + ] + num_layers = config.num_layers + else: + normalized_config = NormalizedConfigManager.get_normalized_config_class( + config.model_type + )(config) + num_layers = normalized_config.num_layers + num_attention_heads = normalized_config.num_attention_heads + hidden_size = normalized_config.hidden_size + d_k = hidden_size // num_attention_heads + num_key_value_heads = num_attention_heads + nb_pkv = 2 + if hasattr(normalized_config, "num_key_value_heads"): + num_key_value_heads = normalized_config.num_key_value_heads + if hasattr(normalized_config, "multi_query_group_num"): + num_key_value_heads = normalized_config.multi_query_group_num + if config.model_type == "bloom": + for nb_pkv in range(nb_pkv): + if nb_pkv % 2 == 0: + new_shape = [input_bs * num_key_value_heads, d_k, 1] + else: + new_shape = [input_bs * num_key_value_heads, 1, d_k] + + else: + new_shape = [input_bs, 
num_key_value_heads, 1, d_k] + + beam_idx_tmp = torch.zeros( + (2048, int(input_bs * num_beams)), dtype=torch.long + ).contiguous() + past_key_values = [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + torch.zeros(size=new_shape).contiguous(), + torch.zeros(size=new_shape).contiguous(), + beam_idx_tmp, + ) + for _ in range(num_layers) + ] + return tuple(past_key_values) + +IPEX_OPT_LLM_SUPPORTED_DICT = { + "2.2": ["gptj", "opt", "llama", "falcon", "chatglm", "baichuan", "gpt-neox"], + "2.3": [ + "gptj", + "opt", + "llama", + "falcon", + "chatglm", + "baichuan", + "qwen", + "bloom", + "codegen", + "gptbigcode", + "t5", + "mixtral", + "mpt", + ], +} + +MODEL_TYPES_REQUIRING_POSITION_IDS = { + "codegen", + "gpt2", + "gpt-bigcode", + "gpt-neo", + "gpt-neox", + "gptj", + "imagegpt", + "llama", + "mistral", + "chatglm", +} + +if is_ipex_available() and ipex.__version__ == "2.2.0+cpu": + logger.info( + "ipex.llm.optimize by 2.2.0 version supported model family: {}".format( + ",".join(IPEX_OPT_LLM_SUPPORTED_DICT["2.2"]) + ) + ) + logger.info( + "The recommended transformers version is 4.35.2 if you used IPEX 2.2.0 version." + ) + IPEX_OPT_LLM_SUPPORTED = IPEX_OPT_LLM_SUPPORTED_DICT["2.2"] +elif is_ipex_available() and ipex.__version__ == "2.3.0+cpu": + logger.info( + "ipex.llm.optimize by 2.3.0 version supported model family: {}".format( + ", ".join(IPEX_OPT_LLM_SUPPORTED_DICT["2.3"]) + ) + ) + logger.info( + "The recommended transformers version is 4.38.1 if you used IPEX 2.3.0 version." + ) + IPEX_OPT_LLM_SUPPORTED = IPEX_OPT_LLM_SUPPORTED_DICT["2.3"] +else: + logger.warning("Please check the intel_extension_for_pytorch version is 2.3.0+cpu.") + IPEX_OPT_LLM_SUPPORTED = IPEX_OPT_LLM_SUPPORTED_DICT["2.3"] + +def get_example_inputs(model_config, batch_size=1, tokenizer=None, num_beams=4): + """Generate the dummy example inputs.""" + prompt = "Welcome to use Intel Extension for Transformers." 
+ prompt = [prompt] * batch_size + input_ids = tokenizer(prompt, return_tensors="pt").input_ids + model_type = model_config.model_type.replace("_", "-") + if model_type in IPEX_OPT_LLM_SUPPORTED: + past_key_values = generate_dummy_past_key_values_for_opt_llm( + config=model_config, + input_bs=batch_size, + num_beams=num_beams + ) + else: + past_key_values = generate_dummy_past_key_values(config=model_config, input_bs=batch_size) + + input_ids = input_ids[:, :512] + attention_mask = torch.ones(input_ids.shape) + position_ids = torch.arange(input_ids.shape[1]).repeat(batch_size, 1) + + if model_type in MODEL_TYPES_REQUIRING_POSITION_IDS: + example_inputs = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "position_ids": position_ids, + "past_key_values": past_key_values + } + else: + example_inputs = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "past_key_values": past_key_values + } + return example_inputs + + +def make_torchscript_model(model, json_file_path, example_inputs): + """Recover ipex model from JSON file. + + Args: + model (object): fp32 model need to do quantization. + json_file_path (json): configuration JSON file for ipex. + example_inputs (tuple or torch.Tensor or dict): example inputs that will be passed to the ipex function. 
+ + Returns: + (object): quantized model + """ + + ipex = LazyImport("intel_extension_for_pytorch") + from torch.ao.quantization.observer import MinMaxObserver + + if ipex.__version__ >= "2.1.100": + qconfig = ipex.quantization.get_smooth_quant_qconfig_mapping(alpha=0.5, act_observer=MinMaxObserver) + else: + qconfig = ipex.quantization.get_smooth_quant_qconfig_mapping(alpha=0.5, act_observer=MinMaxObserver()) + if isinstance(example_inputs, dict): + model = ipex.quantization.prepare(model, qconfig, example_kwarg_inputs=example_inputs, inplace=True) + else: + model = ipex.quantization.prepare(model, qconfig, example_inputs=example_inputs, inplace=True) + model.load_qconf_summary(qconf_summary=json_file_path) + model = ipex.quantization.convert(model, inplace=True) + model.eval() + with torch.no_grad(): + try: + if isinstance(example_inputs, dict): + # pylint: disable=E1120,E1123 + model = torch.jit.trace(model, example_kwarg_inputs=example_inputs) + else: + model = torch.jit.trace(model, example_inputs) + model = torch.jit.freeze(model.eval()) + except: + if isinstance(example_inputs, dict): + # pylint: disable=E1120,E1123 + model = torch.jit.trace(model, example_kwarg_inputs=example_inputs, strict=False, check_trace=False) + else: + model = torch.jit.trace(model, example_inputs, strict=False) + model = torch.jit.freeze(model.eval()) + if isinstance(example_inputs, dict): + model(**example_inputs) + model(**example_inputs) + elif isinstance(example_inputs, tuple) or isinstance(example_inputs, list): + model(*example_inputs) + model(*example_inputs) + else: + model(example_inputs) + model(example_inputs) + return model + +def recover_model_from_json(fp32_model_name_or_path, json_file_path, trust_remote_code=False): + """Recover ipex model from JSON file. + + Args: + model (object): fp32 model need to do quantization. + json_file_path (json): configuration JSON file for ipex. + trust_remote_code (bool): trust remote code. 
+ + Returns: + (object): quantized model + """ + from transformers import AutoModelForCausalLM + + # ipex recovered int8 model from configure.json requests float32 model input and on cpu device. + user_model = AutoModelForCausalLM.from_pretrained(fp32_model_name_or_path, + trust_remote_code=trust_remote_code).float() + if user_model.config.model_type in IPEX_OPT_LLM_SUPPORTED: + import intel_extension_for_pytorch as ipex + qconfig = ipex.quantization.default_static_qconfig_mapping + user_model = ipex.optimize_transformers( + user_model.eval(), + dtype=torch.float, + inplace=True, + quantization_config=qconfig, + deployment_mode=False, + ) + + # tokenizer + if user_model.config.model_type == "llama": + from transformers import LlamaTokenizer + tokenizer = LlamaTokenizer.from_pretrained(user_model.config.name_or_path) + else: + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained( + user_model.config.name_or_path, trust_remote_code=trust_remote_code + ) + + # example_inputs + example_inputs = get_example_inputs(user_model.config, tokenizer=tokenizer) + + # pylint: disable=E0611 + user_model.config.torchscript = True + config = user_model.config + user_model = make_torchscript_model(user_model, json_file_path, example_inputs) + import intel_extension_for_pytorch as ipex + from intel_extension_for_transformers.transformers.llm.evaluation.models import ( + TSModelCausalLMForITREX, + ) + origin_model_type = config.model_type + if origin_model_type in ["chatglm", "qwen", "baichuan"]: + config.model_type = "qwen2" + user_model = TSModelCausalLMForITREX(user_model, config=config) + user_model.config.model_type = origin_model_type + return user_model diff --git a/intel_extension_for_transformers/transformers/utils/utility_tf.py b/intel_extension_for_transformers/transformers/utils/utility_tf.py deleted file mode 100644 index f19785740af..00000000000 --- a/intel_extension_for_transformers/transformers/utils/utility_tf.py +++ 
/dev/null @@ -1,107 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (c) 2022 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Utils for tensorflow framework.""" - -import os -import json -from collections import OrderedDict, UserDict -from neural_compressor.experimental import common - -TMPPATH = os.path.join('tmp', 'model') -TEACHERPATH = os.path.join('tmp', 'teacher_model') -class TFDataloader(object): - """Tensorflow dataloader. - - Args: - dataset (string): Dataset - """ - - def __init__(self, dataset, batch_size=None): - """Init an instance.""" - self.dataset = dataset - self.batch_size = batch_size - - def __iter__(self): - """Get the iteration of dataset.""" - for inputs, labels in self.dataset: - if isinstance(inputs, dict) or isinstance(inputs, OrderedDict) \ - or isinstance(inputs, UserDict): - for name in inputs.keys(): - inputs[name] = inputs[name].numpy() - elif isinstance(inputs, list) or isinstance(inputs, tuple): - inputs = [input.numpy() for input in inputs] - else: - inputs = inputs.numpy() - - if isinstance(labels, dict) or isinstance(labels, OrderedDict) \ - or isinstance(labels, UserDict): # pragma: no cover - for name in labels.keys(): - labels[name] = labels[name].numpy() - elif isinstance(labels, list) or isinstance(labels, tuple): - labels = [label.numpy() for label in labels] - else: - labels = labels.numpy() - yield inputs, labels - - def __len__(self): - """Return the length of dataset.""" - 
return len(self.dataset) - - -def distributed_init(worker_addresses, type='worker', index=0): - """Init distribute environment. - - Args: - worker_addresses: Addresses of all nodes. - type: The type of node, such as worker. - index: When index is 0, the node treat as a chief. - """ - tf_config = { - 'cluster': { - 'worker': worker_addresses - }, - 'task': {'type': type, 'index': index} - } - os.environ['TF_CONFIG'] = json.dumps(tf_config) - -def _is_chief(task_type, task_id): - # here only consider the case in which TF_CONFIG task_type is set as worker - # and task_id=0 represents the chief - return (task_type == 'worker' and task_id == 0) - -# get model folder path for the distributed environment -def get_filepath(base_dirpath, task_type, task_id): - """Get model folder path for the distributed environment. - - Args: - base_dirpath: The basic folder path. - task_type: Task_type is set as worker. - task_id: Task id. When task_id=0, the node treat as a chief. - """ - if task_type is None: # single node - return base_dirpath - elif _is_chief(task_type, task_id): - return os.path.join(base_dirpath, 'chief') - else: - return os.path.join(base_dirpath, 'worker_' + str(task_id)) - - -# convert a Keras model to SavedModel -def keras2SavedModel(model): # pragma: no cover - """Transfer keras model into save_model.""" - model = common.Model(model) - return model.model diff --git a/setup.py b/setup.py index 17700afeeb3..13aec7b7025 100644 --- a/setup.py +++ b/setup.py @@ -8,10 +8,12 @@ from pathlib import Path from setuptools import Extension, find_packages, setup from setuptools.command.build_ext import build_ext +from setuptools_scm import get_version result = subprocess.Popen("pip install -r requirements.txt", shell=True) result.wait() + def is_intel_gpu_available(): import torch import intel_extension_for_pytorch as ipex @@ -286,6 +288,9 @@ def check_submodules(): "intel_extension_for_transformers/transformers/runtime/"), ]) cmdclass = {'build_ext': CMakeBuild} + 
itrex_version = get_version() + if IS_INTEL_GPU: + itrex_version = itrex_version + "-gpu" setup( name="intel-extension-for-transformers", @@ -324,4 +329,5 @@ def check_submodules(): ], setup_requires=['setuptools_scm'], use_scm_version=True, + version=itrex_version )