diff --git a/olmo/scaling/scaling_laws/utils.py b/olmo/scaling/scaling_laws/utils.py index 929c51f66..388a133b2 100644 --- a/olmo/scaling/scaling_laws/utils.py +++ b/olmo/scaling/scaling_laws/utils.py @@ -705,6 +705,9 @@ def get_step1_data_by_name(configs, task_name, y_metric="rc_bpb", moving_avg=1): keys = task.get_accuracy_keys() elif y_metric == "c4": keys = ["eval/c4_en-validation/CrossEntropyLoss"] + elif y_metric == "rc_soft_log": + keys = task.get_accuracy_keys() + keys = [key.replace("/downstream/", "/downstream_soft_log/").replace("_len_norm", "_soft_log") for key in keys] else: raise ValueError(f"Invalid y_metric: {y_metric}") @@ -724,6 +727,8 @@ def get_step1_data_by_name(configs, task_name, y_metric="rc_bpb", moving_avg=1): y = np.average( [float(row[key]) for key in keys], weights=[WEIGHT_BY_KEY.get(key, 1.0) for key in keys] ) + if y_metric == "rc_soft_log": + y *= -1 ds.append(d) ys.append(y) fs.append(f) diff --git a/scripts/scaling/predict.py b/scripts/scaling/predict.py index 1229a5a7a..92a49cc0f 100644 --- a/scripts/scaling/predict.py +++ b/scripts/scaling/predict.py @@ -11,7 +11,7 @@ from step1 import fit_step1 from step2 import fit_step2 -from olmo.scaling.scaling_laws.fitting_functions import chinchilla_n_d_fit, sigmoid +from olmo.scaling.scaling_laws.fitting_functions import chinchilla_n_d_fit, sigmoid, log_sigmoid from olmo.scaling.scaling_laws.utils import ( get_final_configs, get_step1_data_by_name, @@ -25,7 +25,7 @@ def parse_args(): parser.add_argument( "-k", "--keys", nargs="+", default=[], help="For avg metrics. Use one of [all-val-lm, all-bpb]" ) - parser.add_argument("-x", "--x_metric", default="rc_bpb", choices=["rc_bpb", "c4"], help="Metric as input") + parser.add_argument("-x", "--x_metric", default="rc_bpb", choices=["rc_bpb", "c4", "rc_soft_log"], help="Metric as input") parser.add_argument( "-y", "--y_metric", default="rc_acc", choices=["rc_acc", "mc_acc"], help="Metric to predict" ) @@ -43,6 +43,7 @@ def parse_args(): parser.add_argument( "-t", "--target-name", type=str, default=None, help="Path to the csv file of the target model" ) + parser.add_argument("--use_log_sigmoid", action="store_true", help="Use log sigmoid for fitting") args = parser.parse_args() args.keys = get_task_sets(args.keys) @@ -76,11 +77,12 @@ def main(): moving_avg=args.moving_avg, skip_perc=args.skip_perc, ) - step2_coefficients, _ = fit_step2(step2_data_by_name, task_name, args.y_metric) + step2_coefficients, _ = fit_step2(step2_data_by_name, task_name, args.y_metric, args.use_log_sigmoid) # make predictions pred_loss = chinchilla_n_d_fit([args.n, args.d], step1_coefficients) - pred_acc = sigmoid(pred_loss, *step2_coefficients) + fit_fn = log_sigmoid if args.use_log_sigmoid else sigmoid + pred_acc = fit_fn(pred_loss, *step2_coefficients) if args.target_name: data = step2_data_by_name[args.target_name] actual_acc = data["ys"][-1] diff --git a/scripts/scaling/step1.py b/scripts/scaling/step1.py index c03d39679..9852d449f 100644 --- a/scripts/scaling/step1.py +++ b/scripts/scaling/step1.py @@ -1,6 +1,7 @@ # python scripts/scaling/step1.py -k v2_main -c scripts/scaling/final.json -o figure/peteish-moreeval/step1_main.pdf --moving_avg 5 # python scripts/scaling/step1.py -k v2_main -c scripts/scaling/final.json -o figure/peteish-moreeval/step1_c4_main.pdf --y_metric c4 --moving_avg 5 # python scripts/scaling/step1.py -k v2_main -c scripts/scaling/final.json -o figure/peteish-moreeval/step1_acc_main.pdf --y_metric rc_acc +# python scripts/scaling/step1.py -o figure/peteish-moreeval/step1_taskce.pdf -c scripts/scaling/step2.json -k v2_main -y rc_soft_log import argparse from typing import Any, List, Tuple @@ -31,7 +32,7 @@ def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("-k", "--keys", nargs="+", default=[], help="Key(s) for tasks") parser.add_argument( - "-y", "--y_metric", default="rc_bpb", choices=["rc_bpb", "rc_acc", "c4"], help="Metric to predict" + "-y", "--y_metric", default="rc_bpb", choices=["rc_bpb", "rc_acc", "c4", "rc_soft_log"], help="Metric to predict" ) parser.add_argument("--moving_avg", type=int, default=1, help="Moving average for bpb loss") parser.add_argument("-c", "--config-path", type=str, required=True, help="Path to config file") @@ -54,7 +55,7 @@ def fit_step1(data_by_name, y_metric): bounds: List[Tuple[Any, Any]] - if y_metric == "rc_bpb" or y_metric == "c4": + if y_metric == "rc_bpb" or y_metric == "c4" or y_metric == "rc_soft_log": p0 = [3.0, 6.0, 0.1, 0.2, 1.0] bounds = [(0, None), (0, None), (0, None), (0, None), (0, None)] # p0 = [3.0, 6.0, 0.25, 0.3, 1.0] @@ -112,7 +113,7 @@ def predict_step1(configs, data_by_name, coefficients, y_metric): dmin = 0.8 * min([min(data["ds"]) for data in data_by_name.values()]) dmax = 1.5 * max([max(data["ds"]) for data in data_by_name.values()]) - if y_metric == "rc_bpb" or y_metric == "c4": + if y_metric == "rc_bpb" or y_metric == "c4" or y_metric == "rc_soft_log": func = chinchilla_n_d_fit elif y_metric == "rc_acc": func = chinchilla_n_d_negated_fit @@ -251,14 +252,13 @@ def plot_step1( ax.legend(loc="upper right", ncols=1, fontsize=FONTSIZE) ax.set_xlabel("Tokens (D)", fontsize=FONTSIZE) - if y_metric == "rc_bpb": - ax.set_ylabel("Task loss", fontsize=FONTSIZE) - elif y_metric == "rc_acc": - ax.set_ylabel("Task RC accuracy", fontsize=FONTSIZE) - elif y_metric == "c4": - ax.set_ylabel("C4 loss", fontsize=FONTSIZE) - else: - raise ValueError(f"Unknown y_metric: {y_metric}") + y_label_name = { + "rc_bpb": "Task loss", + "rc_acc": "Task RC accuracy", + "c4": "C4 loss", + "rc_soft_log": "TaskCE", + }[y_metric] + ax.set_ylabel(y_label_name, fontsize=FONTSIZE) ax.set_title( f"{tasks[task_name].display_name} ({avg_unsigned_rel_error * 100:.2f}%)", fontsize=FONTSIZE, diff --git a/scripts/scaling/step2.py b/scripts/scaling/step2.py index 8be971482..d47fb9d71 100644 --- a/scripts/scaling/step2.py +++ b/scripts/scaling/step2.py @@ -1,6 +1,7 @@ # python scripts/scaling/step2.py -k v2_main -c scripts/scaling/step2.json -o figure/peteish-moreeval/step2_main.pdf --skip_perc 0.1 --moving_avg 5 # python scripts/scaling/step2.py -k v2_main -c scripts/scaling/step2.json -o figure/peteish-moreeval/step2_c4_main.pdf --x_metric c4 --skip_perc 0.1 --moving_avg 5 # python scripts/scaling/step2.py -k mmlu_avg_test_5shot -c scripts/scaling/step2_mc.json -o figure/peteish-moreeval/step2_mc_mmlu.pdf -y mc_acc +# python scripts/scaling/step2.py -o figure/peteish-moreeval/step2_taskce.pdf -c scripts/scaling/step2.json -k v2_main --skip_perc 0.5 --use_log_sigmoid --x_metric rc_soft_log import argparse @@ -61,10 +62,11 @@ def fit_step2(data_by_name, task_name, y_metric, use_log_sigmoid=False): data["ys"] = data["ys"][-1:] # add ideal points (these are not plotted) - train_xs.append(0.0) - train_ys.append(tasks[task_name].task_maximum) - # train_xs.append(max(train_xs)) - # train_ys.append(tasks[task_name].task_minimum) + if not use_log_sigmoid: + train_xs.append(0.0) + train_ys.append(tasks[task_name].task_maximum) + # train_xs.append(max(train_xs)) + # train_ys.append(tasks[task_name].task_minimum) # fit the parameters if use_log_sigmoid: @@ -96,6 +98,8 @@ def predict_step2(configs, data_by_name, coefficients, cov, y_metric, use_log_si fit_fn = log_sigmoid_fit if use_log_sigmoid else sigmoid_fit grad_fit_fn = grad_log_sigmoid_fit if use_log_sigmoid else grad_sigmoid_fit + all_rel_errors = [] + predicted_data_by_name = {} for name, data in data_by_name.items(): config = configs[name] @@ -109,6 +113,8 @@ def predict_step2(configs, data_by_name, coefficients, cov, y_metric, use_log_si std_error = get_std_errors([x], [y_pred], coefficients, cov, fit_fn, grad_fit_fn)[0] delta_error = 1.96 * std_error + all_rel_errors.append(rel_error) + xmin = min(min(data["xs"]) for data in data_by_name.values()) xmax = max(max(data["xs"]) for data in data_by_name.values()) xmin = xmin - 0.2 * (xmax - xmin) @@ -119,7 +125,7 @@ def predict_step2(configs, data_by_name, coefficients, cov, y_metric, use_log_si "ys": [predict_fn(x, *coefficients) for x in xs], } - return predicted_data_by_name, plotted_predicted_data, (y, y_pred, rel_error, delta_error) + return predicted_data_by_name, plotted_predicted_data, (y, y_pred, rel_error, delta_error), all_rel_errors def plot_step2( @@ -259,6 +265,7 @@ def main(): results = "Task Name | Actual Value | Predicted Value | Relative Error" + rel_errors = [] for i, task_name in enumerate(args.keys): data_by_name = get_step2_data_by_name( configs, @@ -273,9 +280,10 @@ def main(): # a, x0, k, b = coefficients # make predictions - predicted_data_by_name, plotted_predicted_data, (y, y_pred, rel_error, delta_error) = predict_step2( + predicted_data_by_name, plotted_predicted_data, (y, y_pred, rel_error, delta_error), all_rel_errors = predict_step2( configs, data_by_name, coefficients, cov, y_metric=args.y_metric, use_log_sigmoid=args.use_log_sigmoid ) + rel_errors += all_rel_errors str_formula = str_sigmoid(coefficients, use_log_sigmoid=args.use_log_sigmoid) results += f"\n{task_name} | {prettify(y, False)} | {prettify(y_pred, False)} | {prettify(rel_error)} | {str_formula}" @@ -298,6 +306,8 @@ def main(): ax=ax, ) + print(f"Mean relative error: {np.mean(np.abs(rel_errors)) * 100:.2f}%") + handles, labels = axes[-1][-1].get_legend_handles_labels() # delete x-axis labels for all but the bottom row for i in range(num_cols):