This repository has been archived by the owner on Oct 25, 2024. It is now read-only.

Commit 2e77b6b: Merge branch 'main' into wangchang/qwen
changwangss authored Oct 23, 2023
2 parents 572ecbf + 606cbeb
Showing 16 changed files with 90 additions and 181 deletions.
8 changes: 5 additions & 3 deletions in .github/workflows/chatbot-finetune-mpt-7b-chat.yml
@@ -23,9 +23,9 @@ jobs:

- name: Build Docker Image
run:
ITREX_VER=${{ github.sha }} || true

docker build --no-cache ./ --target cpu --build-arg REPO=${{ github.server_url }}/${{ github.repository }}.git --build-arg ITREX_VER=${{ github.head_ref }} --build-arg http_proxy="${{ env.HTTP_PROXY_IMAGE_BUILD }}" --build-arg https_proxy="${{ env.HTTPS_PROXY_IMAGE_BUILD }}" -f intel_extension_for_transformers/neural_chat/docker/Dockerfile -t chatbotfinetune-mpi:latest && yes | docker container prune && yes | docker image prune
if [[ $(docker images | grep chatbotfinetune-mpi | wc -l) == 0 ]]; then
docker build ./ --target cpu --build-arg REPO=${{ github.server_url }}/${{ github.repository }}.git --build-arg ITREX_VER=${{ github.head_ref }} --build-arg http_proxy="${{ env.HTTP_PROXY_IMAGE_BUILD }}" --build-arg https_proxy="${{ env.HTTPS_PROXY_IMAGE_BUILD }}" -f intel_extension_for_transformers/neural_chat/docker/Dockerfile -t chatbotfinetune-mpi:latest && yes | docker container prune && yes | docker image prune;
fi
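The change above drops the unconditional `--no-cache` rebuild and instead builds only when no matching image exists, so repeated CI runs reuse the cached image. A minimal standalone sketch of the pattern (image name illustrative):

```bash
# Build the image only if no existing tag matches the name (sketch).
IMAGE="chatbotfinetune-mpi"
if [[ $(docker images | grep "$IMAGE" | wc -l) == 0 ]]; then
    docker build ./ -t "$IMAGE:latest"
fi
```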

- name: Start Docker Container on socket 0
id: master_container
@@ -48,6 +48,8 @@ jobs:
- name: Run Finetuning
run: |
sh .github/workflows/script/chatbot/prepare_ft_mpt-7b-chat_mpi.sh ${{ steps.master_container.outputs.master_node }} ${{ steps.slave_container.outputs.slave_node }}
docker exec "chatbotfinetune-mpi-s0" bash -c "cd /root/chatbot && source activate && conda activate neuralchat && pip uninstall intel-extension-for-transformers -y && python setup.py install"
docker exec "chatbotfinetune-mpi-s1" bash -c "cd /root/chatbot && source activate && conda activate neuralchat && pip uninstall intel-extension-for-transformers -y && python setup.py install"
docker exec "chatbotfinetune-mpi-s0" bash -c "cd /root/chatbot; source ./bash_setup.sh; mpirun -f ./hosts2 -n 2 -ppn 1 -genv OMP_NUM_THREADS=48 sh .github/workflows/script/chatbot/start_ft_mpt-7b-chat_mpi.sh"
- name: Print Logs and Check Finetuning Status
12 changes: 8 additions & 4 deletions in .github/workflows/chatbot-inference-llama-2-7b-chat-hf.yml
@@ -23,9 +23,9 @@ jobs:

- name: Build Docker Image
run:
ITREX_VER=${{ github.sha }} || true

docker build --no-cache ./ --target cpu --build-arg REPO=${{ github.server_url }}/${{ github.repository }}.git --build-arg ITREX_VER=${{ github.head_ref }} --build-arg http_proxy="${{ env.HTTP_PROXY_IMAGE_BUILD }}" --build-arg https_proxy="${{ env.HTTPS_PROXY_IMAGE_BUILD }}" -f intel_extension_for_transformers/neural_chat/docker/Dockerfile -t chatbotinfer:latest && yes | docker container prune && yes | docker image prune
if [ $(docker images | grep chatbotinfer | wc -l) == 0 ]; then
docker build ./ --target cpu --build-arg REPO=${{ github.server_url }}/${{ github.repository }}.git --build-arg ITREX_VER=${{ github.head_ref }} --build-arg http_proxy="${{ env.HTTP_PROXY_IMAGE_BUILD }}" --build-arg https_proxy="${{ env.HTTPS_PROXY_IMAGE_BUILD }}" -f intel_extension_for_transformers/neural_chat/docker/Dockerfile -t chatbotinfer:latest && yes | docker container prune && yes | docker image prune;
fi

- name: Start Docker Container
run: |
@@ -35,7 +35,11 @@ jobs:
- name: Run Inference Test
run: |
docker exec "chatbotinfer" bash -c "cd /root/chatbot && source activate && conda activate neuralchat; python workflows/chatbot/inference/generate.py --base_model_path \"meta-llama/Llama-2-7b-chat-hf\" --hf_access_token \"${{ env.HF_ACCESS_TOKEN }}\" --instructions \"Transform the following sentence into one that shows contrast. The tree is rotten.\" "
docker exec "chatbotinfer" bash -c "cd /root/chatbot && source activate && conda activate neuralchat;\
pip uninstall intel-extension-for-transformers -y; \
pip install -r requirements.txt; \
python setup.py install; \
python workflows/chatbot/inference/generate.py --base_model_path \"meta-llama/Llama-2-7b-chat-hf\" --hf_access_token \"${{ env.HF_ACCESS_TOKEN }}\" --instructions \"Transform the following sentence into one that shows contrast. The tree is rotten.\" "
- name: Stop Container
if: success() || failure()
12 changes: 8 additions & 4 deletions in .github/workflows/chatbot-inference-mpt-7b-chat.yml
@@ -23,9 +23,9 @@ jobs:

- name: Build Docker Image
run:
ITREX_VER=${{ github.sha }} || true

docker build --no-cache ./ --target cpu --build-arg REPO=${{ github.server_url }}/${{ github.repository }}.git --build-arg ITREX_VER=${{ github.head_ref }} --build-arg http_proxy="${{ env.HTTP_PROXY_IMAGE_BUILD }}" --build-arg https_proxy="${{ env.HTTPS_PROXY_IMAGE_BUILD }}" -f intel_extension_for_transformers/neural_chat/docker/Dockerfile -t chatbotinfer:latest && yes | docker container prune && yes | docker image prune
if [ $(docker images | grep chatbotinfer | wc -l) == 0 ]; then
docker build ./ --target cpu --build-arg REPO=${{ github.server_url }}/${{ github.repository }}.git --build-arg ITREX_VER=${{ github.head_ref }} --build-arg http_proxy="${{ env.HTTP_PROXY_IMAGE_BUILD }}" --build-arg https_proxy="${{ env.HTTPS_PROXY_IMAGE_BUILD }}" -f intel_extension_for_transformers/neural_chat/docker/Dockerfile -t chatbotinfer:latest && yes | docker container prune && yes | docker image prune;
fi

- name: Start Docker Container
run: |
@@ -35,7 +35,11 @@ jobs:
- name: Run Inference Test
run: |
docker exec "chatbotinfer" bash -c "cd /root/chatbot && source activate && conda activate neuralchat; python workflows/chatbot/inference/generate.py --base_model_path \"mosaicml/mpt-7b-chat\" --instructions \"Transform the following sentence into one that shows contrast. The tree is rotten.\" "
docker exec "chatbotinfer" bash -c "cd /root/chatbot && source activate && conda activate neuralchat; \
pip uninstall intel-extension-for-transformers -y; \
pip install -r requirements.txt; \
python setup.py install; \
python workflows/chatbot/inference/generate.py --base_model_path \"mosaicml/mpt-7b-chat\" --instructions \"Transform the following sentence into one that shows contrast. The tree is rotten.\" "
- name: Stop Container
if: success() || failure()
4 changes: 2 additions & 2 deletions in .github/workflows/chatbot-test.yml
@@ -41,8 +41,8 @@ jobs:
call-inference-mpt-7b-chat:
uses: ./.github/workflows/chatbot-inference-mpt-7b-chat.yml

call-finetune-mpt-7b-chat:
uses: ./.github/workflows/chatbot-finetune-mpt-7b-chat.yml
#call-finetune-mpt-7b-chat:
# uses: ./.github/workflows/chatbot-finetune-mpt-7b-chat.yml

#call-inference-llama-2-7b-chat-hf-hpu:
# uses: ./.github/workflows/chatbot-inference-llama-2-7b-chat-hf-hpu.yml
22 changes: 11 additions & 11 deletions in .github/workflows/deploy-test.yml
@@ -1,17 +1,17 @@
name: Deploy Model Test

on:
pull_request:
branches: [main]
paths:
- ".github/workflows/deploy-test.yml"
- ".github/workflows/script/models/run_deploy.sh"
- "intel_extension_for_transformers/llm/runtime/deprecated/**"
- "!intel_extension_for_transformers/llm/runtime/deprecated/kernels/**"
- "!intel_extension_for_transformers/llm/runtime/deprecated/test/**"
- "!intel_extension_for_transformers/llm/runtime/graph/**"
- "!intel_extension_for_transformers/llm/runtime/deprecated/third_party/**"
- "!intel_extension_for_transformers/llm/runtime/deprecated/docs/**"
#pull_request:
# branches: [main]
# paths:
# - ".github/workflows/deploy-test.yml"
# - ".github/workflows/script/models/run_deploy.sh"
# - "intel_extension_for_transformers/llm/runtime/deprecated/**"
# - "!intel_extension_for_transformers/llm/runtime/deprecated/kernels/**"
# - "!intel_extension_for_transformers/llm/runtime/deprecated/test/**"
# - "!intel_extension_for_transformers/llm/runtime/graph/**"
# - "!intel_extension_for_transformers/llm/runtime/deprecated/third_party/**"
# - "!intel_extension_for_transformers/llm/runtime/deprecated/docs/**"

workflow_dispatch:

19 changes: 9 additions & 10 deletions in .github/workflows/optimize-test.yml
@@ -1,16 +1,15 @@
name: Optimize Model Test

on:
pull_request:
branches: [main]
paths:
- intel_extension_for_transformers/transformers/**
- intel_extension_for_transformers/utils/**
- intel_extension_for_transformers/llm/evaluation/**
- intel_extension_for_transformers/llm/quantization/**
- '.github/workflows/optimize-test.yml'
- '.github/workflows/script/models/run_optimize.sh'

#pull_request:
# branches: [main]
# paths:
# - intel_extension_for_transformers/transformers/**
# - intel_extension_for_transformers/utils/**
# - intel_extension_for_transformers/llm/evaluation/**
# - intel_extension_for_transformers/llm/quantization/**
# - '.github/workflows/optimize-test.yml'
# - '.github/workflows/script/models/run_optimize.sh'

workflow_dispatch:

.github/workflows/script/chatbot/prepare_ft_mpt-7b-chat_mpi.sh
@@ -8,10 +8,12 @@ export I_MPI_HYDRA_IFACE=eth0
EOF
)"
# for launching mpirun from yaml
docker exec "chatbotfinetune-mpi-s0" bash -c "cd /root/chatbot; echo \"source activate && conda activate neuralchat\" > bash_setup.sh; echo \"$prepare_script\" >> bash_setup.sh; echo export MASTER_ADDR=$master_node >> bash_setup.sh"
docker exec "chatbotfinetune-mpi-s0" bash -c "cd /root/chatbot; echo \"source activate && conda activate neuralchat\" > bash_setup.sh; echo export MASTER_ADDR=$master_node >> bash_setup.sh"
# for ssh setup mpi and oneccl properly
docker exec "chatbotfinetune-mpi-s0" bash -c "echo \"$prepare_script\" >> ~/.bashrc; echo export MASTER_ADDR=$master_node >> ~/.bashrc"

docker exec "chatbotfinetune-mpi-s1" bash -c "echo \"$prepare_script\" >> ~/.bashrc; echo export MASTER_ADDR=$master_node >> ~/.bashrc"


echo "$master_node" > ./hosts2
echo "$slave_node" >> ./hosts2
2 changes: 2 additions & 0 deletions in .github/workflows/script/unitTest/run_unit_test_optimize.sh
@@ -29,6 +29,8 @@ function pytest() {
itrex_path=$(python -c 'import intel_extension_for_transformers; import os; print(os.path.dirname(intel_extension_for_transformers.__file__))')
find . -name "test*.py" | sed 's,\.\/,coverage run --source='"${itrex_path}"' --append ,g' | sed 's/$/ --verbose/' >run.sh
coverage erase
## exclude tf UT
sed -i "s/test_tf.*.py//g" run.sh

# run UT
$BOLD_YELLOW && echo "cat run.sh..." && $RESET
@@ -53,9 +53,14 @@ RUN apt-get install git -y
RUN python3 -m pip uninstall UNKNOWN -y
RUN ln -s /usr/bin/python3 /usr/bin/python
ENV COMPOSE_DOCKER_CLI_BUILD=0
RUN wget https://raw.githubusercontent.com/intel/intel-extension-for-pytorch/master/scripts/compile_bundle.sh
RUN bash compile_bundle.sh
RUN python3 -m pip install intel_extension_for_transformers optimum

# Source install torch and intel-extension-for-pytorch
#RUN wget https://raw.githubusercontent.com/intel/intel-extension-for-pytorch/master/scripts/compile_bundle.sh
#RUN bash compile_bundle.sh

# Install torch and intel-extension-for-pytorch 2.1
RUN python3 -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
RUN python3 -m pip install intel-extension-for-pytorch intel-extension-for-transformers optimum
RUN python3 -m pip install git+https://github.com/huggingface/optimum-intel.git@1f57059e9bd65380f93c8951f16da58d56ad0859
RUN python3 -m pip install git+https://github.com/bigcode-project/bigcode-evaluation-harness@0d84db85f9ff971fa23a187a3347b7f59af288dc

@@ -13,15 +13,6 @@ pip install -r requirements.txt
python setup.py install
```

Here is how to install intel-extension-for-pytorch from source.
```shell
# gcc version >= 11
git clone https://github.com/intel/intel-extension-for-pytorch.git
cd intel-extension-for-pytorch
git submodule sync && git submodule update --init --recursive
python setup.py install
```

Required libraries.
```shell
pip install -r requirements.txt
@@ -53,7 +44,7 @@ We use the gpt_bigcode definition script [modeling_gpt_bigcode.py](https://githu
## 1. Quantization
``` bash
python run_generation.py \
--model bigcode/starcoderbase \
--model bigcode/starcoder \
--output_dir "./saved_results" \
--quantize \
--sq \
@@ -75,7 +66,7 @@ export LD_PRELOAD=${CONDA_PREFIX}/lib/libiomp5.so
export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so
# --int8 is used for int8 model
OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <cpu list> python run_generation.py \
--model bigcode/starcoderbase \
--model bigcode/starcoder \
--output_dir "./saved_results" \
--int8 \
--ipex \
@@ -87,7 +78,7 @@ OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <cpu list> python ru
```bash
# --int8 is used for int8 model
python run_generation.py \
--model bigcode/starcoderbase \
--model bigcode/starcoder \
--output_dir "./saved_results" \
--int8 \
--ipex \
@@ -3,9 +3,10 @@ einops
datasets >= 2.0
protobuf
sentencepiece != 0.1.92
torch >= 1.10.0
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.1.0+cpu
transformers
neural-compressor
git+https://github.com/huggingface/optimum.git
git+https://github.com/huggingface/optimum-intel.git@1f57059e9bd65380f93c8951f16da58d56ad0859
git+https://github.com/huggingface/optimum-intel.git
git+https://github.com/bigcode-project/bigcode-evaluation-harness@0d84db85f9ff971fa23a187a3347b7f59af288dc
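The requirements now pin a CPU-only torch build through PyTorch's wheel index instead of accepting any `torch >= 1.10.0`. Installing the same pin by hand would look like this (sketch):

```bash
# Same pin as requirements.txt, installed directly from the CPU-only wheel index.
pip install torch==2.1.0+cpu --extra-index-url https://download.pytorch.org/whl/cpu
```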
32 changes: 21 additions & 11 deletions in examples/huggingface/pytorch/text-generation/quantization/README.md
@@ -1,6 +1,26 @@
# Step-by-Step
We provide the inference benchmarking script `run_generation.py` for [EleutherAI/gpt-j-6B](https://huggingface.co/EleutherAI/gpt-j-6B), [decapoda-research/llama-7b-hf](https://huggingface.co/decapoda-research/llama-7b-hf), [decapoda-research/llama-13b-hf](https://huggingface.co/decapoda-research/llama-13b-hf), [lmsys/vicuna-7b-v1.3](https://huggingface.co/lmsys/vicuna-7b-v1.3), [databricks/dolly-v2-3b](https://huggingface.co/databricks/dolly-v2-3b), [bigscience/bloom-7b1](https://huggingface.co/bigscience/bloom-7b1), [facebook/opt-1.3b](https://huggingface.co/facebook/opt-1.3b), [facebook/opt-2.7b](https://huggingface.co/facebook/opt-2.7b), [facebook/opt-6.7b](https://huggingface.co/facebook/opt-6.7b), [mosaicml/mpt-7b-chat](https://huggingface.co/mosaicml/mpt-7b-chat), [Intel/neural-chat-7b-v1-1](https://huggingface.co/Intel/neural-chat-7b-v1-1), more models are working in progress.
We provide the inference benchmarking script `run_generation.py` for large language models. The following models have been validated; more are in progress.

|Validated models| SmoothQuant alpha |
|---|---|
|[EleutherAI/gpt-j-6B](https://huggingface.co/EleutherAI/gpt-j-6B)| 1.0 |
|[decapoda-research/llama-7b-hf](https://huggingface.co/decapoda-research/llama-7b-hf)| 0.7 |
|[decapoda-research/llama-13b-hf](https://huggingface.co/decapoda-research/llama-13b-hf)| 0.8 |
|[lmsys/vicuna-7b-v1.3](https://huggingface.co/lmsys/vicuna-7b-v1.3)| 0.7 |
|[meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)| 1.0 |
|[databricks/dolly-v2-3b](https://huggingface.co/databricks/dolly-v2-3b)| 0.5 |
|[bigscience/bloom-560m](https://huggingface.co/bigscience/bloom-560m)| 0.5 |
|[bigscience/bloom-1b7](https://huggingface.co/bigscience/bloom-1b7)| 0.5 |
|[bigscience/bloom-7b1](https://huggingface.co/bigscience/bloom-7b1)| 0.5 |
|[bigscience/bloomz-560m](https://huggingface.co/bigscience/bloomz-560m)| 0.5 |
|[bigscience/bloomz-1b7](https://huggingface.co/bigscience/bloomz-1b7)| 0.5 |
|[bigscience/bloomz-7b1](https://huggingface.co/bigscience/bloomz-7b1)| 0.5 |
|[facebook/opt-1.3b](https://huggingface.co/facebook/opt-1.3b)| 0.5 |
|[facebook/opt-2.7b](https://huggingface.co/facebook/opt-2.7b)| 0.5 |
|[facebook/opt-6.7b](https://huggingface.co/facebook/opt-6.7b)| 0.5 |
|[mosaicml/mpt-7b-chat](https://huggingface.co/mosaicml/mpt-7b-chat)| 1.0 |
|[Intel/neural-chat-7b-v1-1](https://huggingface.co/Intel/neural-chat-7b-v1-1)| 1.0 |
>**Note**: The default search algorithm is beam search with num_beams = 4; if you'd like to use greedy search for comparison, add "--greedy" to the args.
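The alpha column is SmoothQuant's balance between activation and weight quantization difficulty. Assuming `run_generation.py` exposes it through an `--alpha` flag next to `--sq` (an assumption based on the quantization commands elsewhere in this commit), a run for one of the 0.5-alpha models might look like:

```bash
# Hypothetical invocation: the --alpha value comes from the table above.
python run_generation.py \
  --model facebook/opt-1.3b \
  --output_dir "./saved_results" \
  --quantize \
  --sq \
  --alpha 0.5
```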

@@ -9,17 +29,8 @@ We provide the inference benchmarking script `run_generation.py` for [EleutherAI
PyTorch and intel-extension-for-pytorch version 2.1 are required. The dependent packages are listed in requirements; we recommend creating the environment with the following steps.

```bash
conda create -n llm python=3.9 -y
conda activate llm
bash build_env.sh
git clone https://github.com/intel/intel-extension-for-transformers.git
cd intel-extension-for-transformers
pip install -r requirements.txt
python setup.py install
```
> Note:
> Disable the semi-compiler to avoid accuracy regression for the mpt and neural-chat-v1-1 models; other models don't need it.
> `export _DNNL_DISABLE_COMPILER_BACKEND=1`

> Note: If the `ImportError: /lib64/libstdc++.so.6: version ``GLIBCXX_3.4.29`` not found` error is raised when importing intel-extension-for-pytorch, it is due to a newer GCC library requirement; the following locates the correct version.
> ```bash
> ```

@@ -28,7 +39,6 @@ python setup.py install
# Run
We support compression technologies such as `MixedPrecision`, `SmoothQuant`, and `WeightOnlyQuant` with the `RTN`/`AWQ`/`TEQ` algorithms, as well as `BitsandBytes`; `load_in_4bit` and `load_in_8bit` are provided for the CPU device. The following commands show how to use them.