From 11abd174aebd25c21637d5735cc06d1132879e10 Mon Sep 17 00:00:00 2001
From: Nikita Agarwal
Date: Sun, 14 Apr 2024 23:59:12 +0530
Subject: [PATCH 1/2] updated leaderboard README

---
 leaderboard/README.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/leaderboard/README.md b/leaderboard/README.md
index 660c5e7be..574a6a2aa 100644
--- a/leaderboard/README.md
+++ b/leaderboard/README.md
@@ -21,6 +21,7 @@ Follow the setup instructions in the evaluation harness [README](https://github
 Create two folders `generations_$model` and `metrics_$model` where you will save the generated code and the metrics respectively for your model `$model`.
 
 ```bash
+model=YOUR_MODEL
 cd bigcode-evaluation-harness
 mkdir generations_$model
 mkdir metrics_$model
@@ -58,6 +59,7 @@ for lang in "${langs[@]}"; do
         --trust_remote_code \
         --use_auth_token \
         --generation_only \
+        --save_generations \
         --save_generations_path $generations_path
     echo "Task $task done"
 done
@@ -111,7 +113,7 @@ for lang in "${langs[@]}"; do
         task=multiple-$lang
     fi
 
-    gen_suffix=generations_$task\_$model.json
+    gen_suffix=generations_$task\_$model\_$task.json
     metric_suffix=metrics_$task\_$model.json
     echo "Evaluation of $model on $task benchmark, data in $generations_path/$gen_suffix"

From 101788dad2c7add50a6bdaac5fe0db564e30af09 Mon Sep 17 00:00:00 2001
From: Nikita Agarwal
Date: Wed, 17 Apr 2024 21:26:23 +0530
Subject: [PATCH 2/2] Fix for Issue #207

---
 leaderboard/README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/leaderboard/README.md b/leaderboard/README.md
index 574a6a2aa..8993f7778 100644
--- a/leaderboard/README.md
+++ b/leaderboard/README.md
@@ -30,7 +30,8 @@ mkdir metrics_$model
 To run the evaluation, we first generate the code solutions for the target tasks on GPUs, then execute the code on a docker container (only cpus are needed).
 
 ### 2- Generation
-Below are the instruction for generating the code solutions sequentially or in parallel with slurm. You might need to reduce the batch size for some models or change the precision based on your device.
+Below are the instructions for generating the code solutions sequentially or in parallel with slurm.
+You might need to reduce the batch size for some models, change the precision based on your device, or change `max_length` to 1024 for some tasks based on your tokeniser.
 ```bash
 # after activating env and setting up accelerate...
 langs=(py js java cpp swift php d jl lua r rkt rs)
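
Note on the `gen_suffix` hunk in PATCH 1/2: the doubled `_$task` suffix suggests that the harness itself appends the task name to whatever filename is passed via `--save_generations_path` (and that `--save_generations` must be set for the file to be written at all, which would explain the added flag). Below is a minimal sketch of the resulting file naming under that assumption — it is inferred from the diff, not verified against the harness source, and `YOUR_MODEL` / `multiple-js` are placeholder values:

```bash
#!/usr/bin/env bash
# Assumption (inferred from the patch): the harness rewrites
# --save_generations_path from "<name>.json" to "<name>_<task>.json".
model=YOUR_MODEL        # placeholder, matching the patched README
task=multiple-js        # example MultiPL-E task

requested=generations_$task\_$model.json        # name passed to the harness
written=generations_$task\_$model\_$task.json   # name that lands on disk

# The evaluation loop must therefore match the written name:
gen_suffix=generations_$task\_$model\_$task.json
echo "looking for $gen_suffix (not $requested)"
```

If the harness version in use does not append the task suffix, the pre-patch single-suffix name (`generations_$task\_$model.json`) applies instead.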