From 8cc954abea0de21a39c66b4ab266116f8c8c298f Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Mon, 25 Nov 2024 00:06:27 +0000 Subject: [PATCH] Misc improvements --- .../scaling/scaling_laws/fitting_functions.py | 12 +- olmo/scaling/scaling_laws/utils.py | 31 ++-- scripts/eval_hf_plot.py | 134 ++++++++++++++++++ scripts/scaling/eval_bpb_mc.py | 128 ----------------- scripts/scaling/predict.py | 15 +- scripts/scaling/single_step.py | 23 ++- scripts/scaling/step1.py | 19 ++- scripts/scaling/step1_flops.py | 57 ++++---- scripts/scaling/step2.py | 32 +++-- 9 files changed, 254 insertions(+), 197 deletions(-) create mode 100644 scripts/eval_hf_plot.py delete mode 100644 scripts/scaling/eval_bpb_mc.py diff --git a/olmo/scaling/scaling_laws/fitting_functions.py b/olmo/scaling/scaling_laws/fitting_functions.py index e5180e23e..88a1b82ac 100644 --- a/olmo/scaling/scaling_laws/fitting_functions.py +++ b/olmo/scaling/scaling_laws/fitting_functions.py @@ -43,17 +43,17 @@ def get_std_errors(xs, ys, coefficients, cov, fitting_func, grad_fitting_func): # x = flops -# p[0] = A, p[1] = B, p[2] = E +# p[0] = a = log(A), p[1] = alpha, p[2] = E def chinchilla_flops_fit(x, p): - # return ax**b + E - return p[0] * np.pow(x, p[1]) + p[2] + # return e**a / x**alpha + E + return np.exp(p[0]) / x ** p[1] + p[2] def grad_chinchilla_flops_fit(x, p): - grad_A = np.pow(x, p[1]) - grad_B = p[0] * np.pow(x, p[1]) * np.log(x) + grad_a = np.exp(p[0]) / x ** p[1] + grad_alpha = np.exp(p[0]) * (-np.log(x)) / x ** p[1] grad_E = 1 - return [grad_A, grad_B, grad_E] + return [grad_a, grad_alpha, grad_E] # x[0] = d, x[1] = h diff --git a/olmo/scaling/scaling_laws/utils.py b/olmo/scaling/scaling_laws/utils.py index 49029cd6d..21934174c 100644 --- a/olmo/scaling/scaling_laws/utils.py +++ b/olmo/scaling/scaling_laws/utils.py @@ -236,7 +236,15 @@ def get_log_soft_loss_keys(self): "boolq_val": 0.5, "winogrande_val": 0.5, } -v2_maximums_rc: Dict[str, float] = {} +v2_maximums_rc: Dict[str, float] = { + # "mmlu_avg_test": 1.06, + # "arc_challenge_test": 1.65, + # "arc_easy_test": 1.40, + # "piqa_val": 1.53, + # "csqa_val": 1.10, + # "socialiqa_val": 0.73, + # "openbookqa_test": 1.94, +} v2_core_names = [ "hellaswag_val", @@ -317,8 +325,8 @@ def get_log_soft_loss_keys(self): task_accuracy_key=[f"eval/downstream/{key}_rc_5shot_len_norm" for key in v2_mmlu_val_names], task_mc_loss_key=[f"eval/downstream_bpb/{key}_mc_5shot_bpb" for key in v2_mmlu_val_names], task_mc_accuracy_key=[f"eval/downstream/{key}_mc_5shot_len_norm" for key in v2_mmlu_val_names], - task_minimum=0.25, - task_maximum=1.0, + task_minimum=v2_minimums_rc.get("mmlu_avg_val", 0.25), + task_maximum=v2_maximums_rc.get("mmlu_avg_val", 1.0), ) } @@ -330,8 +338,8 @@ def get_log_soft_loss_keys(self): task_accuracy_key=[f"eval/downstream/{key}_rc_5shot_len_norm" for key in v2_mmlu_test_names], task_mc_loss_key=[f"eval/downstream_bpb/{key}_mc_5shot_bpb" for key in v2_mmlu_test_names], task_mc_accuracy_key=[f"eval/downstream/{key}_mc_5shot_len_norm" for key in v2_mmlu_test_names], - task_minimum=0.25, - task_maximum=1.0, # 0.9, + task_minimum=v2_minimums_rc.get("mmlu_avg_test", 0.25), + task_maximum=v2_maximums_rc.get("mmlu_avg_test", 1.0), ) } @@ -673,6 +681,8 @@ def get_step1_data_by_name(configs, task_name, y_metric="rc_bpb", moving_avg=1): keys = task.get_loss_keys() elif y_metric == "rc_acc": keys = task.get_accuracy_keys() + elif y_metric == "c4": + keys = ["eval/c4_en-validation/CrossEntropyLoss"] else: raise ValueError(f"Invalid y_metric: {y_metric}") @@ -688,7 +698,7 @@ def get_step1_data_by_name(configs, task_name, y_metric="rc_bpb", moving_avg=1): ds, ys, fs = [], [], [] for row in rows: d = int(float(row["throughput/total_tokens"])) - f = d * MODEL_FLOPS[name.split("-")[0]] + f = float(d * MODEL_FLOPS[name.split("-")[0]]) y = np.average( [float(row[key]) for key in keys], weights=[WEIGHT_BY_KEY.get(key, 1.0) for key in keys] ) @@ -744,9 +754,14 @@ def get_length(path): return "" -def get_step2_data_by_name(configs, task_name, y_metric="rc_acc", moving_avg=1, skip_perc=0.0, last_n_points=-1): +def get_step2_data_by_name(configs, task_name, x_metric="rc_bpb", y_metric="rc_acc", moving_avg=1, skip_perc=0.0, last_n_points=-1): task = tasks[task_name] - loss_keys = task.get_loss_keys() + if x_metric == "rc_bpb": + loss_keys = task.get_loss_keys() + elif x_metric == "c4": + loss_keys = ["eval/c4_en-validation/CrossEntropyLoss"] + else: + raise ValueError(f"Invalid x_metric: {x_metric}") if y_metric == "rc_acc": accuracy_keys = task.get_accuracy_keys() elif y_metric == "mc_acc": diff --git a/scripts/eval_hf_plot.py b/scripts/eval_hf_plot.py new file mode 100644 index 000000000..9aaa4b422 --- /dev/null +++ b/scripts/eval_hf_plot.py @@ -0,0 +1,134 @@ +import json + +import matplotlib.pyplot as plt +import numpy as np + +MODELS = [ + "allenai/OLMo-7B-0724-hf", + # 'allenai/OLMo-1B-0724-hf', + # 'allenai/OLMo-7B-0424-hf', + "allenai/OLMo-7B-hf", + "allenai/OLMo-1B-hf", + "meta-llama/Llama-3.2-3B", + "meta-llama/Llama-3.2-1B", + # 'meta-llama/Llama-3.1-70B', + "meta-llama/Llama-3.1-8B", + # 'meta-llama/Meta-Llama-3-70B', + "meta-llama/Meta-Llama-3-8B", + # 'meta-llama/Llama-2-70b-hf', + # 'meta-llama/Llama-2-13b-hf', + # 'meta-llama/Llama-2-7b-hf', + # 'google/gemma-2-27b', + # 'google/gemma-2-9b', + # 'google/gemma-2-2b', + # 'google/gemma-7b', + # 'google/gemma-2b', + # 'Qwen/Qwen2.5-72B', + # 'Qwen/Qwen2.5-32B', + "Qwen/Qwen2.5-14B", + "Qwen/Qwen2.5-7B", + "Qwen/Qwen2.5-3B", + "Qwen/Qwen2.5-1.5B", + # 'Qwen/Qwen2-72B', + "Qwen/Qwen2-7B", + "Qwen/Qwen2-1.5B", + "mistralai/Mistral-Nemo-Base-2407", + "mistralai/Mistral-7B-v0.3", + "mistralai/Mistral-7B-v0.1", +] + +COLOR_BY_MODEL_PREFIX = { + "allenai": "hotpink", + "meta-llama/Llama-3.2": "darkblue", + "meta-llama/Llama-3.1": "mediumblue", + "meta-llama/Meta-Llama-3": "royalblue", + "meta-llama/Llama-2": "cornflowerblue", + "google/gemma-2-": "darkgreen", + "google/gemma-": "forestgreen", + "Qwen/Qwen2.5": "darkviolet", + "Qwen/Qwen2": "violet", + "mistralai": "darkorange", +} + + +def get_color(model): + for prefix, color in COLOR_BY_MODEL_PREFIX.items(): + if model.startswith(prefix): + return color + return "black" + + +METRICS_BY_TASK = { + "rc_rc_mmlu": [ + ("mmlu_stem_test_rc_5shot_bpb", "mmlu_stem_test_rc_5shot_len_norm", 0.215), + ("mmlu_humanities_test_rc_5shot_bpb", "mmlu_humanities_test_rc_5shot_len_norm", 0.335), + ("mmlu_social_sciences_test_rc_5shot_bpb", "mmlu_social_sciences_test_rc_5shot_len_norm", 0.219), + ("mmlu_other_test_rc_5shot_bpb", "mmlu_other_test_rc_5shot_len_norm", 0.231), + ], + "rc_rc_hellaswag": [("hellaswag_val_rc_5shot_bpb", "hellaswag_val_rc_5shot_len_norm", 1.0)], + "rc_rc_arc-c": [("arc_challenge_test_rc_5shot_bpb", "arc_challenge_test_rc_5shot_len_norm", 1.0)], + "rc_rc_arc-e": [("arc_easy_test_rc_5shot_bpb", "arc_easy_test_rc_5shot_len_norm", 1.0)], + "rc_rc_piqa": [("piqa_val_rc_5shot_bpb", "piqa_val_rc_5shot_len_norm", 1.0)], + "rc_rc_csqa": [("csqa_val_rc_5shot_bpb", "csqa_val_rc_5shot_len_norm", 1.0)], + "rc_rc_socialiqa": [("socialiqa_val_rc_5shot_bpb", "socialiqa_val_rc_5shot_len_norm", 1.0)], + "rc_rc_openbookqa": [("openbookqa_test_rc_5shot_bpb", "openbookqa_test_rc_5shot_len_norm", 1.0)], + "rc_mc_mmlu": [ + ("mmlu_stem_test_rc_5shot_bpb", "mmlu_stem_test_mc_5shot_len_norm", 0.215), + ("mmlu_humanities_test_rc_5shot_bpb", "mmlu_humanities_test_mc_5shot_len_norm", 0.335), + ("mmlu_social_sciences_test_rc_5shot_bpb", "mmlu_social_sciences_test_mc_5shot_len_norm", 0.219), + ("mmlu_other_test_rc_5shot_bpb", "mmlu_other_test_mc_5shot_len_norm", 0.231), + ], + "rc_mc_hellaswag": [("hellaswag_val_rc_5shot_bpb", "hellaswag_val_mc_5shot_acc", 1.0)], + "rc_mc_arc-c": [("arc_challenge_test_rc_5shot_bpb", "arc_challenge_test_mc_5shot_acc", 1.0)], + "rc_mc_arc-e": [("arc_easy_test_rc_5shot_bpb", "arc_easy_test_mc_5shot_acc", 1.0)], + "rc_mc_piqa": [("piqa_val_rc_5shot_bpb", "piqa_val_mc_5shot_acc", 1.0)], + "rc_mc_csqa": [("csqa_val_rc_5shot_bpb", "csqa_val_mc_5shot_acc", 1.0)], + "rc_mc_socialiqa": [("socialiqa_val_rc_5shot_bpb", "socialiqa_val_mc_5shot_acc", 1.0)], + "rc_mc_openbookqa": [("openbookqa_test_rc_5shot_bpb", "openbookqa_test_mc_5shot_acc", 1.0)], + "mc_mc_mmlu": [ + ("mmlu_stem_test_mc_5shot_bpb", "mmlu_stem_test_mc_5shot_len_norm", 0.215), + ("mmlu_humanities_test_mc_5shot_bpb", "mmlu_humanities_test_mc_5shot_len_norm", 0.335), + ("mmlu_social_sciences_test_mc_5shot_bpb", "mmlu_social_sciences_test_mc_5shot_len_norm", 0.219), + ("mmlu_other_test_mc_5shot_bpb", "mmlu_other_test_mc_5shot_len_norm", 0.231), + ], + "mc_mc_hellaswag": [("hellaswag_val_mc_5shot_bpb", "hellaswag_val_mc_5shot_acc", 1.0)], + "mc_mc_arc-c": [("arc_challenge_test_mc_5shot_bpb", "arc_challenge_test_mc_5shot_acc", 1.0)], + "mc_mc_arc-e": [("arc_easy_test_mc_5shot_bpb", "arc_easy_test_mc_5shot_acc", 1.0)], + "mc_mc_piqa": [("piqa_val_mc_5shot_bpb", "piqa_val_mc_5shot_acc", 1.0)], + "mc_mc_csqa": [("csqa_val_mc_5shot_bpb", "csqa_val_mc_5shot_acc", 1.0)], + "mc_mc_socialiqa": [("socialiqa_val_mc_5shot_bpb", "socialiqa_val_mc_5shot_acc", 1.0)], + "mc_mc_openbookqa": [("openbookqa_test_mc_5shot_bpb", "openbookqa_test_mc_5shot_acc", 1.0)], +} + +fig, axs = plt.subplots(8, 3, figsize=(3 * 6, 8 * 4.5)) + +for i, (task, metrics) in enumerate(METRICS_BY_TASK.items()): + ax = axs[i % 8, i // 8] + for model in MODELS: + with open(f'wandb/eval_bpb_mc_v2/{model.replace("/", "_")}.json') as f: + data = json.load(f) + try: + rc_bpb = np.average( + [data[f"eval/downstream_bpb/{metric[0]}"] for metric in metrics], + weights=[metric[2] for metric in metrics], + ) + acc = np.average( + [data[f"eval/downstream/{metric[1]}"] for metric in metrics], + weights=[metric[2] for metric in metrics], + ) + except KeyError: + continue + color = get_color(model) + ax.scatter([rc_bpb], [acc], color=color, s=100) + ax.annotate( + text=model.split("/")[1], + xy=(float(rc_bpb), float(acc)), + xytext=(8, -3), + textcoords="offset points", + fontsize=8, + ) + ax.set_xlabel(f'{task.split("_")[0]} bpb') + ax.set_ylabel(f'{task.split("_")[1]} acc') + ax.set_title(task) + +plt.savefig("figure/peteish-moreeval/external.png", dpi=300, bbox_inches="tight") diff --git a/scripts/scaling/eval_bpb_mc.py b/scripts/scaling/eval_bpb_mc.py deleted file mode 100644 index 53ccccb8d..000000000 --- a/scripts/scaling/eval_bpb_mc.py +++ /dev/null @@ -1,128 +0,0 @@ -import json - -import matplotlib.pyplot as plt -import numpy as np - -MODELS = [ - "allenai/OLMo-7B-0724-hf", - # 'allenai/OLMo-1B-0724-hf', - # 'allenai/OLMo-7B-0424-hf', - "allenai/OLMo-7B-hf", - "allenai/OLMo-1B-hf", - "meta-llama/Llama-3.2-3B", - "meta-llama/Llama-3.2-1B", - # 'meta-llama/Llama-3.1-70B', - "meta-llama/Llama-3.1-8B", - # 'meta-llama/Meta-Llama-3-70B', - "meta-llama/Meta-Llama-3-8B", - # 'meta-llama/Llama-2-70b-hf', - # 'meta-llama/Llama-2-13b-hf', - # 'meta-llama/Llama-2-7b-hf', - # 'google/gemma-2-27b', - # 'google/gemma-2-9b', - # 'google/gemma-2-2b', - # 'google/gemma-7b', - # 'google/gemma-2b', - # 'Qwen/Qwen2.5-72B', - # 'Qwen/Qwen2.5-32B', - "Qwen/Qwen2.5-14B", - "Qwen/Qwen2.5-7B", - "Qwen/Qwen2.5-3B", - "Qwen/Qwen2.5-1.5B", - # 'Qwen/Qwen2-72B', - "Qwen/Qwen2-7B", - "Qwen/Qwen2-1.5B", - "mistralai/Mistral-Nemo-Base-2407", - "mistralai/Mistral-7B-v0.3", - "mistralai/Mistral-7B-v0.1", -] - -COLOR_BY_MODEL_PREFIX = { - "allenai": "hotpink", - "meta-llama/Llama-3.2": "darkblue", - "meta-llama/Llama-3.1": "mediumblue", - "meta-llama/Meta-Llama-3": "royalblue", - "meta-llama/Llama-2": "cornflowerblue", - "google/gemma-2-": "darkgreen", - "google/gemma-": "forestgreen", - "Qwen/Qwen2.5": "darkviolet", - "Qwen/Qwen2": "violet", - "mistralai": "darkorange", -} - - -def get_color(model): - for prefix, color in COLOR_BY_MODEL_PREFIX.items(): - if model.startswith(prefix): - return color - return "black" - - -METRICS_BY_TASK = { - "rc_rc_mmlu": [ - ("mmlu_stem_var_bpb", "mmlu_stem_var_len_norm", 0.215), - ("mmlu_humanities_var_bpb", "mmlu_humanities_var_len_norm", 0.335), - ("mmlu_social_sciences_var_bpb", "mmlu_social_sciences_var_len_norm", 0.219), - ("mmlu_other_var_bpb", "mmlu_other_var_len_norm", 0.231), - ], - "rc_rc_hellaswag": [("hellaswag_rc_5shot_bpb", "hellaswag_rc_5shot_len_norm", 1.0)], - "rc_rc_arc-c": [("arc_challenge_rc_5shot_bpb", "arc_challenge_rc_5shot_len_norm", 1.0)], - "rc_rc_piqa": [("piqa_rc_5shot_bpb", "piqa_rc_5shot_len_norm", 1.0)], - "rc_rc_csqa": [("csqa_rc_5shot_bpb", "csqa_rc_5shot_len_norm", 1.0)], - "rc_rc_socialiqa": [("socialiqa_rc_5shot_bpb", "socialiqa_rc_5shot_len_norm", 1.0)], - "rc_mc_mmlu": [ - ("mmlu_stem_var_bpb", "mmlu_stem_mc_5shot_len_norm", 0.215), - ("mmlu_humanities_var_bpb", "mmlu_humanities_mc_5shot_len_norm", 0.335), - ("mmlu_social_sciences_var_bpb", "mmlu_social_sciences_mc_5shot_len_norm", 0.219), - ("mmlu_other_var_bpb", "mmlu_other_mc_5shot_len_norm", 0.231), - ], - "rc_mc_hellaswag": [("hellaswag_rc_5shot_bpb", "hellaswag_mc_5shot_acc", 1.0)], - "rc_mc_arc-c": [("arc_challenge_rc_5shot_bpb", "arc_challenge_mc_5shot_acc", 1.0)], - "rc_mc_piqa": [("piqa_rc_5shot_bpb", "piqa_mc_5shot_acc", 1.0)], - "rc_mc_csqa": [("csqa_rc_5shot_bpb", "csqa_mc_5shot_acc", 1.0)], - "rc_mc_socialiqa": [("socialiqa_rc_5shot_bpb", "socialiqa_mc_5shot_acc", 1.0)], - "mc_mc_mmlu": [ - ("mmlu_stem_mc_5shot_bpb", "mmlu_stem_mc_5shot_len_norm", 0.215), - ("mmlu_humanities_mc_5shot_bpb", "mmlu_humanities_mc_5shot_len_norm", 0.335), - ("mmlu_social_sciences_mc_5shot_bpb", "mmlu_social_sciences_mc_5shot_len_norm", 0.219), - ("mmlu_other_mc_5shot_bpb", "mmlu_other_mc_5shot_len_norm", 0.231), - ], - "mc_mc_hellaswag": [("hellaswag_mc_5shot_bpb", "hellaswag_mc_5shot_acc", 1.0)], - "mc_mc_arc-c": [("arc_challenge_mc_5shot_bpb", "arc_challenge_mc_5shot_acc", 1.0)], - "mc_mc_piqa": [("piqa_mc_5shot_bpb", "piqa_mc_5shot_acc", 1.0)], - "mc_mc_csqa": [("csqa_mc_5shot_bpb", "csqa_mc_5shot_acc", 1.0)], - "mc_mc_socialiqa": [("socialiqa_mc_5shot_bpb", "socialiqa_mc_5shot_acc", 1.0)], -} - -fig, axs = plt.subplots(6, 3, figsize=(3 * 6, 6 * 4.5)) - -for i, (task, metrics) in enumerate(METRICS_BY_TASK.items()): - ax = axs[i % 6, i // 6] - for model in MODELS: - with open(f'wandb/eval_bpb_mc/{model.replace("/", "_")}.json') as f: - data = json.load(f) - try: - rc_bpb = np.average( - [data[f"eval/downstream_bpb/{metric[0]}_bpb"] for metric in metrics], - weights=[metric[2] for metric in metrics], - ) - acc = np.average( - [data[f"eval/downstream/{metric[1]}"] for metric in metrics], - weights=[metric[2] for metric in metrics], - ) - except KeyError: - continue - color = get_color(model) - ax.scatter([rc_bpb], [acc], color=color, s=100) - ax.annotate( - text=model.split("/")[1], - xy=(float(rc_bpb), float(acc)), - xytext=(8, -3), - textcoords="offset points", - fontsize=8, - ) - ax.set_xlabel(f'{task.split("_")[0]} bpb') - ax.set_ylabel(f'{task.split("_")[1]} acc') - ax.set_title(task) - -plt.savefig("wandb/eval_bpb_mc/all.png", dpi=300, bbox_inches="tight") diff --git a/scripts/scaling/predict.py b/scripts/scaling/predict.py index 73f135f3e..b9dc8b1c5 100644 --- a/scripts/scaling/predict.py +++ b/scripts/scaling/predict.py @@ -1,5 +1,7 @@ -# python scripts/scaling/predict.py -k main -c scripts/scaling/final.json --step2-config-path scripts/scaling/step2.json -n 6887575552 -d 3945065873408 -t 7b -# python scripts/scaling/predict.py -k main -c scripts/scaling/final.json --step2-config-path scripts/scaling/step2.json -n 13202396160 -d 5000088518656 -t 13b +# python scripts/scaling/predict.py -k v2_main -c scripts/scaling/final.json --step2-config-path scripts/scaling/step2.json -n 6887575552 -d 3945065873408 -t 7b --skip_perc 0.1 --moving_avg 5 +# python scripts/scaling/predict.py -k v2_main -c scripts/scaling/final.json --step2-config-path scripts/scaling/step2.json -n 13202396160 -d 5000088518656 -t 13b --skip_perc 0.1 --moving_avg 5 +# python scripts/scaling/predict.py -k v2_main -c scripts/scaling/final.json --step2-config-path scripts/scaling/step2.json -n 6887575552 -d 3945065873408 -t 7b --skip_perc 0.1 --moving_avg 5 --x_metric c4 +# python scripts/scaling/predict.py -k v2_main -c scripts/scaling/final.json --step2-config-path scripts/scaling/step2.json -n 13202396160 -d 5000088518656 -t 13b --skip_perc 0.1 --moving_avg 5 --x_metric c4 # python scripts/scaling/predict.py -k main_mc -c scripts/scaling/final.json --step2-config-path scripts/scaling/step2_mc.json -y mc_acc -n 6887575552 -d 3945065873408 -t 7b-4T-final # python scripts/scaling/predict.py -k main_mc -c scripts/scaling/final.json --step2-config-path scripts/scaling/step2_mc.json -y mc_acc -n 13202396160 -d 5000088518656 -t 13b-5T-final @@ -23,6 +25,9 @@ def parse_args(): parser.add_argument( "-k", "--keys", nargs="+", default=[], help="For avg metrics. Use one of [all-val-lm, all-bpb]" ) + parser.add_argument( + "-x", "--x_metric", default="rc_bpb", choices=["rc_bpb", "c4"], help="Metric as input" + ) parser.add_argument( "-y", "--y_metric", default="rc_acc", choices=["rc_acc", "mc_acc"], help="Metric to predict" ) @@ -60,13 +65,13 @@ def main(): for r, task_name in enumerate(args.keys): # Step 1 step1_data_by_name = get_step1_data_by_name( - configs, task_name, y_metric="rc_bpb", moving_avg=args.moving_avg + configs, task_name, y_metric=args.x_metric, moving_avg=args.moving_avg ) - step1_coefficients = fit_step1(step1_data_by_name, y_metric="rc_bpb") + step1_coefficients, _ = fit_step1(step1_data_by_name, y_metric=args.x_metric) # Step 2 step2_data_by_name = get_step2_data_by_name( - step2_configs, task_name, y_metric=args.y_metric, moving_avg=args.moving_avg, skip_perc=args.skip_perc + step2_configs, task_name, x_metric=args.x_metric, y_metric=args.y_metric, moving_avg=args.moving_avg, skip_perc=args.skip_perc ) step2_coefficients, _ = fit_step2(step2_data_by_name, task_name, args.y_metric) diff --git a/scripts/scaling/single_step.py b/scripts/scaling/single_step.py index 9802dd2a3..a58b1dd66 100644 --- a/scripts/scaling/single_step.py +++ b/scripts/scaling/single_step.py @@ -1,3 +1,5 @@ +# python scripts/scaling/single_step.py -k v2_main -c scripts/scaling/final.json -o figure/peteish-moreeval/single_step_main.png --moving_avg 5 + import argparse import matplotlib.pyplot as plt @@ -61,7 +63,7 @@ def predict_step12(data_by_name, coefficients): plotted_predicted_data_by_name = {} dmin = 0.8 * min([min(data["ds"]) for data in data_by_name.values()]) - dmax = 1.2 * max([max(data["ds"]) for data in data_by_name.values()]) + dmax = 1.5 * max([max(data["ds"]) for data in data_by_name.values()]) for name, data in data_by_name.items(): predicted_data_by_name[name] = { @@ -111,8 +113,9 @@ def plot_step12( d, y, color=config.color, - marker=MARKERS[i] if config.mode == "train" else "o", - s=50, + marker=MARKERS[i] if config.mode == "train" else "x", + s=50 if config.mode == "train" else 20, + label=f"{config.label} (target)" if config.mode == "eval" else None, ) for d, y, y_pred in zip(data["ds"], data["ys"], predicted_data["ys"]): @@ -120,14 +123,22 @@ def plot_step12( if config.mode == "train": unsigned_rel_errors.append(np.abs(rel_error)) else: + ax.scatter( + d, + y_pred, + color=config.color, + marker="o", + s=20, + # label=f"{config.label} ({'predicted'})", + ) ax.annotate( - f"{prettify(rel_error)}", + f"{abs(rel_error * 100):.1f}%", (d, y), textcoords="offset points", xytext=(3, 3), ha="left", va="bottom", - fontsize=8, + fontsize=10, color=config.color, ) avg_unsigned_rel_error = np.mean(unsigned_rel_errors) @@ -160,7 +171,7 @@ def main(): sns.set_style("whitegrid") num_tasks = len(args.keys) - num_cols = min(4, num_tasks) + num_cols = min(3, num_tasks) num_rows = (num_tasks + num_cols - 1) // num_cols fig, axes = plt.subplots(num_rows, num_cols, figsize=(3.75 * num_cols, 3.25 * num_rows), squeeze=False) diff --git a/scripts/scaling/step1.py b/scripts/scaling/step1.py index 9d2727e9c..23c4d5858 100644 --- a/scripts/scaling/step1.py +++ b/scripts/scaling/step1.py @@ -1,5 +1,6 @@ -# python scripts/scaling/step1.py -k main -c scripts/scaling/final.json -o figure/peteish-moreeval/step1_main.png -# python scripts/scaling/step1.py -k core_small_avg -c scripts/scaling/final.json -o figure/peteish-moreeval/step1_core_small_avg.png +# python scripts/scaling/step1.py -k v2_main -c scripts/scaling/final.json -o figure/peteish-moreeval/step1_main.png --moving_avg 5 +# python scripts/scaling/step1.py -k v2_main -c scripts/scaling/final.json -o figure/peteish-moreeval/step1_c4_main.png --y_metric c4 --moving_avg 5 +# python scripts/scaling/step1.py -k v2_main -c scripts/scaling/final.json -o figure/peteish-moreeval/step1_acc_main.png --y_metric rc_acc import argparse from typing import Any, List, Tuple @@ -29,7 +30,7 @@ def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("-k", "--keys", nargs="+", default=[], help="Key(s) for tasks") parser.add_argument( - "-y", "--y_metric", default="rc_bpb", choices=["rc_bpb", "rc_acc"], help="Metric to predict" + "-y", "--y_metric", default="rc_bpb", choices=["rc_bpb", "rc_acc", "c4"], help="Metric to predict" ) parser.add_argument("--moving_avg", type=int, default=1, help="Moving average for bpb loss") parser.add_argument("-c", "--config-path", type=str, required=True, help="Path to config file") @@ -52,9 +53,13 @@ def fit_step1(data_by_name, y_metric): bounds: List[Tuple[Any, Any]] - if y_metric == "rc_bpb": + if y_metric == "rc_bpb" or y_metric == "c4": p0 = [3.0, 6.0, 0.1, 0.2, 1.0] bounds = [(0, None), (0, None), (0, None), (0, None), (0, None)] + # p0 = [3.0, 6.0, 0.25, 0.3, 1.0] + # bounds = [(0, None), (0, None), (0.25, 0.4), (0.19, 0.31), (0, None)] # moving_avg=1 + # # bounds = [(0, None), (0, None), (0, 0.3), (0.25, 0.45), (0, None)] # moving_avg=10 + # # bounds = [(0, None), (0, None), (0.15, 0.4), (0.3, 0.33), (0, None)] coefficients, cov = get_coefficients_huber( train_nds, train_ys, @@ -106,7 +111,7 @@ def predict_step1(configs, data_by_name, coefficients, y_metric): dmin = 0.8 * min([min(data["ds"]) for data in data_by_name.values()]) dmax = 1.5 * max([max(data["ds"]) for data in data_by_name.values()]) - if y_metric == "rc_bpb": + if y_metric == "rc_bpb" or y_metric == "c4": func = chinchilla_n_d_fit elif y_metric == "rc_acc": func = chinchilla_n_d_negated_fit @@ -245,6 +250,8 @@ def plot_step1( ax.set_ylabel("Task loss") elif y_metric == "rc_acc": ax.set_ylabel("Task RC accuracy") + elif y_metric == "c4": + ax.set_ylabel("C4 loss") else: raise ValueError(f"Unknown y_metric: {y_metric}") ax.set_title( @@ -259,7 +266,7 @@ def main(): sns.set_style("whitegrid") num_tasks = len(args.keys) - num_cols = min(4, num_tasks) + num_cols = min(3, num_tasks) num_rows = (num_tasks + num_cols - 1) // num_cols fitting_error = 0 diff --git a/scripts/scaling/step1_flops.py b/scripts/scaling/step1_flops.py index 043379ee0..4a2849d15 100644 --- a/scripts/scaling/step1_flops.py +++ b/scripts/scaling/step1_flops.py @@ -1,5 +1,4 @@ -# python scripts/scaling/step1.py -k main -c scripts/scaling/final.json -o figure/peteish-final/step1_main.png -# python scripts/scaling/step1.py -k core_small_avg -c scripts/scaling/final.json -o figure/peteish-final/step1_core_small_avg.png +# python scripts/scaling/step1_flops.py -k v2_main -c scripts/scaling/final.json -o figure/peteish-moreeval/step1_flops_main.png --moving_avg 5 import argparse @@ -7,7 +6,11 @@ import numpy as np import seaborn as sns -from olmo.scaling.scaling_laws.fitting_functions import get_coefficients +from olmo.scaling.scaling_laws.fitting_functions import ( + chinchilla_flops_fit, + get_coefficients_huber, + grad_chinchilla_flops_fit, +) from olmo.scaling.scaling_laws.utils import ( get_final_configs, get_step1_data_by_name, @@ -36,11 +39,6 @@ def parse_args(): return args -def chinchilla_flops(x, a, b, E): - # return ax**b + E - return a * np.pow(x, b) + E - - def fit_step1(data_by_name, y_metric): train_fs, train_ys = [], [] for name, data in data_by_name.items(): @@ -49,17 +47,16 @@ def fit_step1(data_by_name, y_metric): train_ys += data["ys"] if y_metric == "rc_bpb": - p0 = [2.0, -0.3, 0.1] - - # test_output = chinchilla_flops(train_fs, *p0) - - bounds = ([0, -np.inf, -np.inf], [np.inf, 0, np.inf]) - coefficients, cov = get_coefficients( + p0 = [3.0, 0.1, 1.0] + bounds = [(0, None), (0, 1.0), (0, None)] + coefficients, cov = get_coefficients_huber( train_fs, train_ys, - chinchilla_flops, - p0, + chinchilla_flops_fit, + grad_chinchilla_flops_fit, + p0=p0, bounds=bounds, + max_iter=1000000, disp=False, return_cov=True, ) @@ -87,24 +84,24 @@ def predict_step1(configs, data_by_name, coefficients, y_metric): unsigned_rel_errors = [] fmin = 0.8 * min([min(data["fs"]) for data in data_by_name.values()]) - fmax = 1.2 * max([max(data["fs"]) for data in data_by_name.values()]) + fmax = 1.5 * max([max(data["fs"]) for data in data_by_name.values()]) if y_metric == "rc_bpb": - func = chinchilla_flops + func = chinchilla_flops_fit elif y_metric == "rc_acc": - func = chinchilla_flops + func = chinchilla_flops_fit else: raise ValueError(f"Unknown y_metric: {y_metric}") for name, data in data_by_name.items(): predicted_data_by_name[name] = { "fs": data["fs"], - "ys": [func(f, *coefficients) for f in data["fs"]], + "ys": [func(f, coefficients) for f in data["fs"]], } fs = np.exp(np.linspace(np.log(fmin), np.log(fmax), 100)) plotted_predicted_data_by_name[name] = { "fs": fs, - "ys": [func(f, *coefficients) for f in fs], + "ys": [func(f, coefficients) for f in fs], } if configs[name].mode == "eval": @@ -121,8 +118,9 @@ def predict_step1(configs, data_by_name, coefficients, y_metric): def str_chinchilla_flops_fit(coefficients): - a, b, E = coefficients - return f"L(F) = {a:.2f}F^{b:.2f} + {E:.2f}" + a, alpha, E = coefficients + A = np.exp(a) + return f"L(F) = {A:.2f} / F^{alpha:.2f} + {E:.2f}" def plot_step1( @@ -166,11 +164,12 @@ def plot_step1( ax.plot( data["fs"], data["ys"], - color=config.color, + color="black", linestyle="--", linewidth=1.5, - label=f'{config.label} ({"fitted" if config.mode == "train" else "predicted"})', + # label=f'{config.label} ({"fitted" if config.mode == "train" else "predicted"})', ) + break # plot the actual and predicted data unsigned_rel_errors = [] @@ -184,7 +183,7 @@ def plot_step1( y, color=config.color, marker=MARKERS[i] if config.mode == "train" else "x", - s=50 if config.mode == "train" else 10, + s=50 if config.mode == "train" else 20, label=f"{config.label} (target)" if config.mode == "eval" else None, ) @@ -198,11 +197,11 @@ def plot_step1( y_pred, color=config.color, marker="o", - s=10, + s=20, label=f"{config.label} ({'predicted'})", ) ax.annotate( - f"{prettify(rel_error)}", + f"{abs(100 * rel_error):.1f}%", (f, y), textcoords="offset points", xytext=(3, 3), @@ -234,7 +233,7 @@ def main(): sns.set_style("whitegrid") num_tasks = len(args.keys) - num_cols = min(4, num_tasks) + num_cols = min(3, num_tasks) num_rows = (num_tasks + num_cols - 1) // num_cols fitting_error = 0 diff --git a/scripts/scaling/step2.py b/scripts/scaling/step2.py index e29fcb5f2..94dcb053d 100644 --- a/scripts/scaling/step2.py +++ b/scripts/scaling/step2.py @@ -1,5 +1,6 @@ -# python scripts/scaling/step2.py -k main -c scripts/scaling/step2.json -o figure/peteish-moreeval/step2_main.png -# python scripts/scaling/step2.py -k main_mc -c scripts/scaling/step2_mc.json -o figure/peteish-moreeval/step2_mc_main.png -y mc_acc +# python scripts/scaling/step2.py -k v2_main -c scripts/scaling/step2.json -o figure/peteish-moreeval/step2_main.png --skip_perc 0.1 --moving_avg 5 +# python scripts/scaling/step2.py -k v2_main -c scripts/scaling/step2.json -o figure/peteish-moreeval/step2_c4_main.png --x_metric c4 --skip_perc 0.1 --moving_avg 5 +# python scripts/scaling/step2.py -k v2_main_mc -c scripts/scaling/step2_mc.json -o figure/peteish-moreeval/step2_mc_main.png -y mc_acc import argparse @@ -26,6 +27,9 @@ def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("-k", "--keys", nargs="+", default=[], help="Key(s) for tasks") + parser.add_argument( + "-x", "--x_metric", default="rc_bpb", choices=["rc_bpb", "c4"], help="Metric as input" + ) parser.add_argument( "-y", "--y_metric", default="rc_acc", choices=["rc_acc", "mc_acc"], help="Metric to predict" ) @@ -56,16 +60,19 @@ def fit_step2(data_by_name, task_name, y_metric): # add ideal points (these are not plotted) train_xs.append(0.0) train_ys.append(tasks[task_name].task_maximum) - train_xs.append(max(train_xs)) - train_ys.append(tasks[task_name].task_minimum) + # train_xs.append(max(train_xs)) + # train_ys.append(tasks[task_name].task_minimum) # fit the parameters coefficients, cov = get_coefficients( train_xs, train_ys, sigmoid, - p0=[tasks[task_name].task_minimum - 1.0, 0.9, 3.0, 1.0], + p0=[tasks[task_name].task_minimum - 1.0, 0.9, 3.0, tasks[task_name].task_maximum], bounds=([-1.0, 0.0, 0.0, 0.0], [0.0, np.inf, np.inf, 1.0]), + # bounds=([-np.inf, 0.0, 0.0, 0.0], [0.0, np.inf, np.inf, np.inf]), + # bounds=([tasks[task_name].task_minimum - 1.0, 0.0, 0.0, tasks[task_name].task_maximum - 0.0001], [tasks[task_name].task_minimum - 0.9999, np.inf, np.inf, tasks[task_name].task_maximum]), + # bounds=([-np.inf, 0.0, 0.0, tasks[task_name].task_maximum - 0.0001], [0.0, np.inf, np.inf, tasks[task_name].task_maximum]), disp=False, return_cov=True, ) @@ -105,6 +112,7 @@ def plot_step2( plotted_predicted_data, task_name, fit_str, + x_metric, y_metric, coefficients, cov, @@ -173,14 +181,19 @@ def plot_step2( ax.fill_between(plotted_predicted_data["xs"], plotted_y_lower, plotted_y_upper, color="pink", alpha=0.3) ax.legend(loc="upper right", ncols=1, fontsize=8) - ax.set_xlabel("Task loss") + if x_metric == "rc_bpb": + ax.set_xlabel("Task loss") + elif x_metric == "c4": + ax.set_xlabel("C4 loss") + else: + raise ValueError(f"Invalid x_metric: {x_metric}") if y_metric == "rc_acc": ax.set_ylabel("Task RC accuracy") elif y_metric == "mc_acc": ax.set_ylabel("Task MC accuracy") else: raise ValueError(f"Invalid y_metric: {y_metric}") - ax.set_ylim([0, 1.0]) + # ax.set_ylim([0, 1.0]) ax.set_title( f"{task_name}\n{fit_str}\navg rel error on fitting = {avg_unsigned_rel_err * 100:.2f}%", fontsize=9, @@ -201,7 +214,7 @@ def main(): sns.set_style("whitegrid") num_tasks = len(args.keys) - num_cols = min(4, num_tasks) + num_cols = min(3, num_tasks) num_rows = (num_tasks + num_cols - 1) // num_cols fig, axes = plt.subplots(num_rows, num_cols, figsize=(3.75 * num_cols, 3.25 * num_rows), squeeze=False) @@ -209,7 +222,7 @@ def main(): for i, task_name in enumerate(args.keys): data_by_name = get_step2_data_by_name( - configs, task_name, y_metric=args.y_metric, moving_avg=args.moving_avg, skip_perc=args.skip_perc + configs, task_name, x_metric=args.x_metric, y_metric=args.y_metric, moving_avg=args.moving_avg, skip_perc=args.skip_perc ) coefficients, cov = fit_step2(data_by_name, task_name, args.y_metric) @@ -232,6 +245,7 @@ def main(): plotted_predicted_data, task_name, str_sigmoid(coefficients), + args.x_metric, args.y_metric, coefficients, cov,