This repository has been archived by the owner on Oct 25, 2024. It is now read-only.

Commit

improve SQ/WOQ examples (#1662)
Signed-off-by: changwangss <[email protected]>
changwangss authored Jul 12, 2024
1 parent 79277b4 commit b12a11b
Showing 13 changed files with 544 additions and 305 deletions.
98 changes: 54 additions & 44 deletions examples/huggingface/pytorch/code-generation/quantization/README.md
@@ -19,63 +19,46 @@ pip install -r requirements.txt

# Run
We provide compression technologies such as `MixedPrecision`, `SmoothQuant` and `WeightOnlyQuant` with `Rtn/Awq/Teq/GPTQ/AutoRound` algorithms, as well as `BitsandBytes`, `load_in_4bit` and `load_in_8bit`, all of which work on the CPU device. The following commands show how to use them.
>**Note**:
> Model type "llama" defaults to using [ipex.optimize_transformers](https://github.com/intel/intel-extension-for-pytorch/blob/339bd251841e153ad9c34e1033ab8b2d936a1781/docs/tutorials/llm/llm_optimize_transformers.md) to accelerate inference, but "llama" requires a transformers version lower than 4.36.0 and "falcon" requires a transformers version lower than 4.33.3.
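For example, a minimal way to satisfy the version constraint from the note above is to pin transformers up front; the exact pins below simply restate the note and are not taken from this repository's requirements:

```bash
# assumption: pick the pin that matches the model family you plan to run
pip install "transformers<4.36.0"   # for "llama", per the note above
# pip install "transformers<4.33.3" # for "falcon", per the note above
```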

- ## 1. Performance
+ ## MixedPrecision and SmoothQuant
+
+ ### 1. Performance
```bash
export KMP_BLOCKTIME=1
export KMP_SETTINGS=1
export KMP_AFFINITY=granularity=fine,compact,1,0
export LD_PRELOAD=${CONDA_PREFIX}/lib/libiomp5.so
export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so
# fp32
- OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <cpu list> python run_generation.py \
+ OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <cpu list> python run_generation_sq.py \
--model bigcode/starcoder \
--benchmark \
- --batch_size 1
+ --benchmark_batch_size 1

# mixedprecision
- OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <cpu list> python run_generation.py \
+ OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <cpu list> python run_generation_sq.py \
--model bigcode/starcoder \
--mixed_precision \
--benchmark \
--batch_size 1

# smoothquant
# [alternative] --int8 is used for int8 only, --int8_bf16_mixed is used for int8 mixed bfloat16 precision.
- python run_generation.py \
+ python run_generation_sq.py \
--model bigcode/starcoder \
--output_dir "./saved_results" \
--sq \
--alpha 0.7 \
- --calib_iters 500 \
+ --calib_n_samples 500 \
--dataset "mbpp" \
--int8 \
--benchmark \
--batch_size 1
- # weightonlyquant
- OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <cpu list> python run_generation.py \
- --model bigcode/starcoder \
- --woq \
- --benchmark \
- --batch_size 1
- # load_in_4bit
- OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <cpu list> python run_generation.py \
- --model bigcode/starcoder \
- --load_in_4bit \
- --benchmark \
- --batch_size 1
- # load_in_8bit
- OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <cpu list> python run_generation.py \
- --model bigcode/starcoder \
- --load_in_8bit \
- --benchmark \
- --batch_size 1
```
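If the SmoothQuant command above has already written the int8 model to `./saved_results`, it should be possible to re-benchmark it without quantizing again by pointing `--model` at that directory; this mirrors what the updated `run_benchmark` hunk later in this commit does (it swaps the model path to the tuned checkpoint for int8 runs), but the exact flag combination here is a sketch, not a documented recipe:

```bash
# assumption: ./saved_results holds the int8 model saved by the --sq run above
OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <cpu list> python run_generation_sq.py \
--model ./saved_results \
--benchmark \
--benchmark_batch_size 1
```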
- ## 2. Accuracy
+ ### 2. Accuracy

```bash
# fp32
python run_generation.py \
python run_generation_sq.py \
--model bigcode/starcoder \
--accuracy \
--batch_size 20 \
@@ -85,7 +68,7 @@ python run_generation.py \
--do_sample \
--tasks "humaneval"
# mixedprecision
- python run_generation.py \
+ python run_generation_sq.py \
--model bigcode/starcoder \
--mixed_precision \
--accuracy \
@@ -97,23 +80,53 @@ python run_generation.py \
--tasks "humaneval"
# smoothquant
# [alternative] --int8 is used for int8 only, --int8_bf16_mixed is used for int8 mixed bfloat16 precision.
- python run_generation.py \
+ python run_generation_sq.py \
--model bigcode/starcoder \
--sq \
--alpha 1.0 \
--int8 \
--accuracy \
--batch_size 20 \
--n_samples 20 \
--allow_code_execution \
--temperature 0.2 \
--do_sample \
--tasks "humaneval"
```
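The comments above mention that `--int8_bf16_mixed` can replace `--int8` when int8 mixed bfloat16 precision is wanted, but no example spells that out. A sketch with only that flag swapped in the accuracy command (everything else unchanged):

```bash
python run_generation_sq.py \
--model bigcode/starcoder \
--sq \
--alpha 1.0 \
--int8_bf16_mixed \
--accuracy \
--batch_size 20 \
--n_samples 20 \
--allow_code_execution \
--temperature 0.2 \
--do_sample \
--tasks "humaneval"
```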

## WeightOnlyQuant

1. ### Performance

```bash
# weightonlyquant
OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <cpu list> python run_generation_cpu_woq.py \
--model bigcode/starcoder \
--woq \
--benchmark \
--benchmark_batch_size 1
# load_in_4bit
OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <cpu list> python run_generation_cpu_woq.py \
--model bigcode/starcoder \
--load_in_4bit \
--benchmark \
--benchmark_batch_size 1
# load_in_8bit
OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <cpu list> python run_generation_cpu_woq.py \
--model bigcode/starcoder \
--load_in_8bit \
--benchmark \
--benchmark_batch_size 1
```
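The accuracy examples below select a weight data type with `--weight_dtype "nf4"`; presumably the same flag can be combined with `--woq` in the benchmark command as well. A sketch under that assumption (the combination itself is not shown in the original README):

```bash
# assumption: --weight_dtype can also be combined with --woq in the benchmark path
OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <cpu list> python run_generation_cpu_woq.py \
--model bigcode/starcoder \
--woq \
--weight_dtype "nf4" \
--benchmark \
--benchmark_batch_size 1
```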

2. ### Accuracy

```bash

# weightonlyquant
- python run_generation.py \
+ python run_generation_cpu_woq.py \
--model bigcode/starcoder \
--woq \
- --woq_weight_dtype "nf4" \
+ --weight_dtype "nf4" \
--accuracy \
--batch_size 20 \
--n_samples 20 \
@@ -122,7 +135,7 @@ python run_generation.py \
--do_sample \
--tasks "humaneval"
# load_in_4bit
- python run_generation.py \
+ python run_generation_cpu_woq.py \
--model bigcode/starcoder \
--load_in_4bit \
--accuracy \
@@ -133,7 +146,7 @@ python run_generation.py \
--do_sample \
--tasks "humaneval"
# load_in_8bit
- python run_generation.py \
+ python run_generation_cpu_woq.py \
--model bigcode/starcoder \
--load_in_8bit \
--accuracy \
@@ -166,17 +179,14 @@ This creates an image called `evaluation-harness-multiple`, and runs a test on it
Suppose the fp32 model is `starcoder-3b` and the quantized model is saved in `saved_results`; evaluate on the `multiple-lua` tasks with:
```
docker run -v $(CURDIR):$(CURDIR) -it /bin/bash
- python3 run_generation.py \
+ python3 run_generation_sq.py \
--model $(CURDIR)/starcoder-3b \
- --quantize \
--sq \
--alpha 0.7 \
- --ipex \
- --calib_iters 500 \
+ --calib_n_samples 500 \
--calib_batch_size 1 \
--dataset "mbpp" \
--output_dir "$(CURDIR)/saved_results" \
- --int8 \
--accuracy \
--tasks multiple-py \
--batch_size 20 \
@@ -191,9 +201,9 @@
To run the container (here from image `evaluation-harness-multiple`) to quantize and evaluate on `CURDIR` (or another directory, mounted with -v), specify n_samples and allow code execution with --allow_code_execution (and add the number of problems with --limit if it was used during generation):
```bash
docker run -v $(CURDIR):$(CURDIR) \
- -it $(IMAGE_NAME) python3 run_generation.py --model $(CURDIR)/starcoder-3b --quantize --sq --alpha 0.7 --ipex \
- --calib_iters 5 --calib_batch_size 1 --dataset "mbpp" --calib_split "test" --output_dir "$(CURDIR)/saved_results" \
- --int8 --accuracy --tasks multiple-py --batch_size 20 --n_samples 20 --allow_code_execution \
+ -it $(IMAGE_NAME) python3 run_generation_sq.py --model $(CURDIR)/starcoder-3b --sq --alpha 0.7 \
+ --calib_n_samples 5 --calib_batch_size 1 --dataset "mbpp" --output_dir "$(CURDIR)/saved_results" \
+ --accuracy --tasks multiple-py --batch_size 20 --n_samples 20 --allow_code_execution \
--do_sample --temperature 0.2 --limit 2

```
Second changed file (a shell script; its path is not shown here):
@@ -14,7 +14,7 @@ function init_params {
batch_size=1
tuned_checkpoint=saved_results
lm_eval_tasks="humaneval"
script="run_generation.py"
script="run_generation_sq.py"
for var in "$@"
do
case $var in
@@ -85,7 +85,7 @@ function run_benchmark {


if [[ ${int8} == "true" ]]; then
- extra_cmd=$extra_cmd" --int8"
+ model_name_or_path=$tuned_checkpoint
fi

echo $extra_cmd
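Read together, the two hunks above mean the benchmark script now launches `run_generation_sq.py` and, for int8 runs, benchmarks the saved `tuned_checkpoint` directly instead of appending an `--int8` flag. A condensed sketch of that control flow (argument parsing omitted and the final invocation assumed, since it is not visible in this diff):

```bash
# condensed sketch of the updated benchmark path, not the full script
script="run_generation_sq.py"
tuned_checkpoint=saved_results
model_name_or_path="bigcode/starcoder"

if [[ ${int8} == "true" ]]; then
    # benchmark the saved quantized checkpoint rather than the fp32 model
    model_name_or_path=$tuned_checkpoint
fi

python ${script} --model ${model_name_or_path} --benchmark ${extra_cmd}
```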
(Diffs for the remaining changed files are not shown.)
