From 11f5da69bb3cde54e1251f32ff4cbe40f0be5ec2 Mon Sep 17 00:00:00 2001 From: Artur Fierka Date: Thu, 7 Nov 2024 09:39:09 +0100 Subject: [PATCH] Add multi-step scheduling scenario to Jenkins CI (#445) Add multi-step scheduling scenario to Jenkins CI --- .../configs/Meta-Llama-3.1-8B-Instruct-mss.yaml | 16 ++++++++++++++++ .jenkins/lm-eval-harness/configs/models-mss.txt | 1 + .../lm-eval-harness/test_lm_eval_correctness.py | 3 +++ .jenkins/test_config.yaml | 14 ++++++++++++++ 4 files changed, 34 insertions(+) create mode 100644 .jenkins/lm-eval-harness/configs/Meta-Llama-3.1-8B-Instruct-mss.yaml create mode 100644 .jenkins/lm-eval-harness/configs/models-mss.txt diff --git a/.jenkins/lm-eval-harness/configs/Meta-Llama-3.1-8B-Instruct-mss.yaml b/.jenkins/lm-eval-harness/configs/Meta-Llama-3.1-8B-Instruct-mss.yaml new file mode 100644 index 0000000000000..ff787f1085cba --- /dev/null +++ b/.jenkins/lm-eval-harness/configs/Meta-Llama-3.1-8B-Instruct-mss.yaml @@ -0,0 +1,16 @@ +# FIXME(kzawora): these scores were generated using vLLM on HPU; we need to confirm them on HF +# VLLM_SKIP_WARMUP=true bash run-lm-eval-gsm-cot-llama-vllm-baseline.sh -m "/mnt/weka/data/pytorch/llama3.1/Meta-Llama-3.1-8B-Instruct" -b 128 -l 1319 -f 8 -t 1 +model_name: "/mnt/weka/data/pytorch/llama3.1/Meta-Llama-3.1-8B-Instruct" +tasks: +- name: "gsm8k_cot_llama" + metrics: + - name: "exact_match,strict-match" + value: 0.8317 + - name: "exact_match,flexible-extract" + value: 0.8355 +limit: null +num_fewshot: 8 +dtype: "bfloat16" +fewshot_as_multiturn: true +apply_chat_template: true +num_scheduler_steps: 10 \ No newline at end of file diff --git a/.jenkins/lm-eval-harness/configs/models-mss.txt b/.jenkins/lm-eval-harness/configs/models-mss.txt new file mode 100644 index 0000000000000..cfcc3d42d108f --- /dev/null +++ b/.jenkins/lm-eval-harness/configs/models-mss.txt @@ -0,0 +1 @@ +Meta-Llama-3.1-8B-Instruct-mss.yaml \ No newline at end of file diff --git 
a/.jenkins/lm-eval-harness/test_lm_eval_correctness.py b/.jenkins/lm-eval-harness/test_lm_eval_correctness.py index 9a31f59b828a9..9272123034350 100644 --- a/.jenkins/lm-eval-harness/test_lm_eval_correctness.py +++ b/.jenkins/lm-eval-harness/test_lm_eval_correctness.py @@ -54,6 +54,9 @@ def launch_lm_eval(eval_config): model_args += ",quantization=inc," \ "kv_cache_dtype=fp8_inc," \ "weights_load_device=cpu" + if eval_config.get("num_scheduler_steps"): + model_args += \ + f",num_scheduler_steps={eval_config.get('num_scheduler_steps')}" kwargs = {} if 'fewshot_as_multiturn' in eval_config: kwargs['fewshot_as_multiturn'] = eval_config['fewshot_as_multiturn'] diff --git a/.jenkins/test_config.yaml b/.jenkins/test_config.yaml index b32563d6222e9..e57bd37c5fb7e 100644 --- a/.jenkins/test_config.yaml +++ b/.jenkins/test_config.yaml @@ -27,3 +27,17 @@ stages: - name: gsm8k_small_g3_tp1_fp8 flavor: g3 command: cd .jenkins/lm-eval-harness && bash run-tests.sh -c configs/models-fp8.txt -t 1 + - name: test_gsm8k_mss + steps: + - name: gsm8k_small_g3_tp1_mss + flavor: g3 + command: cd .jenkins/lm-eval-harness && bash run-tests.sh -c configs/models-mss.txt -t 1 + - name: gsm8k_small_g2_tp1_mss + flavor: g2 + command: cd .jenkins/lm-eval-harness && bash run-tests.sh -c configs/models-mss.txt -t 1 + - name: gsm8k_small_g3_tp2_mss + flavor: g3.s + command: cd .jenkins/lm-eval-harness && bash run-tests.sh -c configs/models-mss.txt -t 2 + - name: gsm8k_small_g2_tp2_mss + flavor: g2.s + command: cd .jenkins/lm-eval-harness && bash run-tests.sh -c configs/models-mss.txt -t 2 \ No newline at end of file