diff --git a/olmo/scaling/scaling_laws/stacked_predictions.py b/olmo/scaling/scaling_laws/stacked_predictions.py
deleted file mode 100644
index 4d8e34865..000000000
--- a/olmo/scaling/scaling_laws/stacked_predictions.py
+++ /dev/null
@@ -1,515 +0,0 @@
-import csv
-from collections import defaultdict
-from typing import Dict, List, Optional, Tuple
-
-import numpy as np
-import pandas as pd
-from matplotlib import pyplot as plt
-from scipy.optimize import curve_fit
-
-from olmo.scaling.scaling_laws.utils import (
-    FinalConfig,
-    chinchilla_n_d_fit,
-    get_coefficients_huber,
-    grad_chinchilla_n_d_fit,
-)
-
-# Fitting functions
-from olmo.util import StrEnum
-
-
-def sigmoid(x, L, x0, k, b):
-    o = L / (1 + np.exp(-k * (x - x0))) + b
-    return o
-
-
-def reverse_sigmoid(y, L, x0, k, b):
-    return x0 - 1 / k * np.log((L / (y - b)) - 1)
-
-
-# Error when using the huber fit; possibly due to incorrect bounds (try later).
-
-# def sigmoid_fit(x, p):
-#     return p[0] / (1 + np.exp(-p[2] * (x - p[1]))) + p[3]
-
-# def grad_sigmoid_fit(x, p):
-#     grad_L = 1 / (1 + np.exp(-p[2] * (x - p[1])))
-#     grad_x0 = p[0] * p[2] * sigmoid_fit(x, p) * (1 - sigmoid_fit(x, p))
-#     grad_k = p[0] * (x - p[1]) * sigmoid_fit(x, p) * (1 - sigmoid_fit(x, p))
-#     grad_b = 1
-#     return [grad_L, grad_x0, grad_k, grad_b]
-
-# # fit the parameters
-# coefficients = get_coefficients_huber(
-#     train_nds,
-#     train_ys,
-#     sigmoid_fit,
-#     grad_sigmoid_fit,
-#     p0=[-1.3, 0.5, 3, 0.3],
-#     bounds=None,  # [(None, 0), (None, None), (None, None), (None, None)],
-# )
-
-
-BASELINE_BY_TASK_NAME = {
-    "HellaSwag-0shot": 0.25,
-    "MMLU-Var": 0.25,
-    "HellaSwag-5shot": 0.25,
-    "ARC-Easy-5shot": 0.25,
-    "ARC-Challenge-5shot": 0.25,
-    "PiQA-5shot": 0.5,
-    "Winogrande-5shot": 0.5,
-    "OpenbookQA-5shot": 0.25,
-    "SciQ-0shot": 0.25,
-    "Copa-0shot": 0.5,
-    "CSQA-5shot": 0.2,
-    "SocialIQA-5shot": 1 / 3,
-}
-
-
-def get_all_data_by_name(configs, keys) -> Dict:
-    data_by_name: Dict = defaultdict(lambda: defaultdict(lambda: []))
-    for name, config in configs.items():
-        for path in config.paths:
-            with open(path) as file_ref:
-                reader = csv.DictReader(file_ref)
-                rows = [row for row in reader]
-                for row in rows:
-                    y = np.mean([float(row[key]) for key in keys])
-                    data_by_name[name][path].append(y)
-    return data_by_name
-
-
-def size_length_from_path(path):
-    name = path.split("/")[-1].removesuffix(".csv")
-    return name.split("-")[:2]
-
-
-def get_dataframe_from_configs(
-    x_dict: Dict[str, Dict],
-    y_dict: Dict[str, Dict],
-    configs: Dict[str, FinalConfig],
-) -> pd.DataFrame:
-    df = pd.DataFrame()
-    xs = []
-    ys = []
-    params = []
-    sizes = []
-    lengths = []
-    modes = []
-    runs = []
-    colors = []
-    for name, path_dict in x_dict.items():
-        config = configs[name]
-        for path in path_dict:
-            size, length = size_length_from_path(path)
-            run_name = f"{size}-{length}"
-            x_data = x_dict[name][path]
-            y_data = y_dict[name][path]
-            xs += x_data
-            ys += y_data
-            params += [config.n for _ in range(len(x_data))]
-            sizes += [size for _ in range(len(x_data))]
-            lengths += [length for _ in range(len(x_data))]
-            modes += [config.mode for _ in range(len(x_data))]
-            runs += [run_name for _ in range(len(x_data))]
-            colors += [config.color for _ in range(len(x_data))]
-
-    df["x"] = xs
-    df["y"] = ys
-    df["params"] = params
-    df["size"] = sizes
-    df["length"] = lengths
-    df["mode"] = modes
-    df["run"] = runs
-    df["color"] = colors
-    return df
-
-
-def get_predicted_error(df):
-    eval_row = df[df["mode"] == "eval"].iloc[-1]
-    y = eval_row["y"]
-    y_pred = eval_row["predicted_y"]
-    rel_error = (y_pred - y) / y
-    # rel_error = f"{rel_error * 100:+.1f}%"
-    return rel_error
-
-
-def fit_step1(df: pd.DataFrame):
-    df = df.dropna()
-
-    # Fit
-    train_nds = list(df[df["mode"] == "train"][["params", "x"]].itertuples(index=False, name=None))
-    train_ys = df[df["mode"] == "train"]["y"]
-
-    # fit the parameters
-    coefficients = get_coefficients_huber(
-        train_nds,
-        train_ys,
-        chinchilla_n_d_fit,
-        grad_chinchilla_n_d_fit,
-        p0=[3.0, 6.0, 0.1, 0.2, 1.0],
-        bounds=[(0, None), (0, None), (0, None), (0, None), (0, None)],
-        disp=False,
-    )
-
-    df["predicted_y"] = df.apply(lambda x: chinchilla_n_d_fit([x.params, x.x], coefficients), axis=1)
-    return df, coefficients
-
-
-def predict_step1(n: int, d: int, coefficients: List[float]):
-    return chinchilla_n_d_fit([n, d], coefficients)
-
-
-def plot_step1(
-    df, coefficients, ax, x_label=None, y_label=None, title="Fitting final score", do_label=True, logscale=False
-):
-    # a, b, alpha, beta, E = coefficients
-    # A, B = np.exp(a), np.exp(b)
-
-    eval_row = df[df["mode"] == "eval"].iloc[-1]
-    x = eval_row["x"]
-    y = eval_row["y"]
-    y_pred = eval_row["predicted_y"]
-    rel_error = (y_pred - y) / y
-    run_name = eval_row["run"]
-
-    for label in df["size"].unique():
-        adf = df[df["size"] == label]
-        ax.scatter(
-            adf["x"], adf["y"], color="white", edgecolors=adf["color"], s=7.0, label=label if do_label else None
-        )
-
-    ax.scatter(x, y, marker="x", color="blue", label=f"actual ({run_name}) = {y:0.4f}" if do_label else None, s=50)
-    ax.scatter(
-        x,
-        y_pred,
-        marker="^",
-        color="black",
-        label=f"predicted ({run_name}) = {y_pred:0.4}" if do_label else None,
-        s=50,
-    )
-    ax.annotate(
-        f"{eval_row['run']}: {rel_error * 100:+.1f}%",
-        (x, y),
-        textcoords="offset points",
-        xytext=(10, 5),
-        ha="center",
-        fontsize=10,
-        color="brown",
-    )
-
-    for params in df["params"].unique():
-        plotted_xs = np.linspace(df[df["params"] == params]["x"].max(), df[df["params"] == params]["x"].min(), 100)
-        plotted_ys = [chinchilla_n_d_fit([params, x_val], coefficients) for x_val in plotted_xs]
-
-        ax.plot(
-            plotted_xs,
-            plotted_ys,
-            color="black",
-            linestyle="--",
-            linewidth=0.8,
-        )
-
-    # ax.text(
-    #     x=0.25,
-    #     y=0.50,
-    #     s=f"L(n, d) = {A:.2f} / n^{alpha:.2f} + {B:.2f} / d^{beta:.2f} + {E:.2f}",
-    #     fontsize=10,
-    #     transform=ax.transAxes,
-    # )
-
-    if do_label:
-        ax.legend(loc="upper right", ncols=1)
-
-    if logscale:
-        ax.set_xscale("log")
-
-    ax.set_xlabel(x_label)
-    ax.set_ylabel(y_label)
-    ax.set_title(title)
-
-
-def fit_step2(df: pd.DataFrame, baseline: float, add_ideal_points: bool = True):
-    df = df.dropna()
-
-    # Fit
-
-    train_xs = df[df["mode"] == "train"]["x"]
-    train_ys = df[df["mode"] == "train"]["y"]
-
-    if add_ideal_points:
-        # Anchor the sigmoid: perfect accuracy at ~zero loss, random-chance baseline at high loss.
-        train_xs = pd.concat([pd.Series([0.0001]), train_xs, pd.Series([2.6])], ignore_index=True)
-        train_ys = pd.concat([pd.Series([1.0]), train_ys, pd.Series([baseline])], ignore_index=True)
-
-    coefficients, pcov = curve_fit(sigmoid, train_xs, train_ys, p0=[baseline - 1.0, 0.9, 3.0, 1.0], maxfev=1000000)
-
-    df["predicted_y"] = df["x"].apply(lambda x: sigmoid(x, *coefficients))
-
-    return df, coefficients
-
-
-def predict_step2(bpb_loss: float, coefficients: List[float]):
-    return sigmoid(bpb_loss, *coefficients)
-
-
-def plot_step2(
-    df,
-    coefficients,
-    ax,
-    x_label=None,
-    y_label=None,
-    title="Fitting final score",
-    add_ideal_points=True,
-    do_label=True,
-):
-    eval_row = df[df["mode"] == "eval"].iloc[-1]
-    x = eval_row["x"]
-    y = eval_row["y"]
-    y_pred = eval_row["predicted_y"]
-    rel_error = (y_pred - y) / y
-    run_name = eval_row["run"]
eval_row["run"] - - for label in df["size"].unique(): - adf = df[df["size"] == label] - ax.scatter( - adf["x"], adf["y"], color="white", edgecolors=adf["color"], s=7.0, label=label if do_label else None - ) - - ax.scatter(x, y, marker="x", color="blue", label=f"actual ({run_name}) = {y:0.4f}" if do_label else None, s=50) - ax.scatter( - x, - y_pred, - marker="^", - color="black", - label=f"predicted ({run_name}) = {y_pred:0.4}" if do_label else None, - s=50, - ) - ax.annotate( - f"{eval_row['run']}: {rel_error * 100:+.1f}%", - (x, y), - textcoords="offset points", - xytext=(30, 5), - ha="center", - fontsize=10, - color="brown", - ) - - if add_ideal_points: - plotted_xs = np.linspace(max(2.6, df["x"].max()), 0.01, 100) - else: - plotted_xs = np.linspace(df["x"].max(), df["x"].min(), 100) - plotted_ys = [sigmoid(x_val, *coefficients) for x_val in plotted_xs] - - ax.plot( - plotted_xs, - plotted_ys, - color="black", - linestyle="--", - linewidth=0.8, - ) - - # L, x0, k, b = coefficients - # print(f"σ(L, x0, k, b) \n = {L:.2f} / (1 + e^(-({k:.2f}(x - {x0:.2f})))) + {b:.2f}") - # ax.text( - # x=0.25, - # y=0.50, - # s=f"σ(L, x0, k, b) \n = {L:.2f} / (1 + e^(-({k:.2f}(x - {x0:.2f})))) + {b:.2f}", - # fontsize=10, - # transform=plt.gca().transAxes, - # ) - - if do_label: - ax.legend(loc="upper right", ncols=1) - - ax.set_xlabel(x_label) - ax.set_ylabel(y_label) - ax.set_title(title) - - -def plot_stacked( - df, step2_df, ax, x_label=None, y_label=None, title=None, do_label=True, do_grey=False, logscale=False -): - mode_colors = {"train": "grey", "eval": "lightgrey"} - - for label in df["size"].unique(): - adf = df[df["size"] == label] - ax.scatter( - adf["x"], - adf["y"], - color="white", - edgecolors=adf["mode"].apply(lambda x: mode_colors[x]) if do_grey else adf["color"], - s=7.0, - label=label, - ) - - step2_df = ( - pd.merge(df.reset_index()[["index", "x"]], step2_df, left_on="index", right_on="level_1", how="inner") - .rename({"x_x": "tokens", "x_y": "x"}, axis=1) - .drop("index", axis=1) - .drop("level_1", axis=1) - ) - eval_row = step2_df[step2_df["mode"] == "eval"].iloc[-1] - x = eval_row["tokens"] - y = eval_row["y"] - y_pred = eval_row["predicted_y"] - rel_error = (y_pred - y) / y - - ax.scatter(x, y, marker="x", color="blue", label=f"actual = {y:0.4f}", s=100) - ax.scatter(x, y_pred, marker="^", color="black", label=f"predicted = {y_pred:0.4}", s=100) - ax.annotate( - f"{eval_row['run']}: {rel_error * 100:+.1f}%", - (x, y), - textcoords="offset points", - xytext=(30, -30), - ha="center", - fontsize=10, - color="brown", - ) - - if do_label: - ax.legend(loc="lower right", ncols=1) - - if logscale: - ax.set_xscale("log") - ax.set_xlabel(x_label or "tokens") - ax.set_ylabel(y_label or "accuracy") - ax.set_title(title or "stacked prediction") - - -class DownstreamPredictionFeatures(StrEnum): - raw = "raw" - moving_average = "moving_average" # TODO: how to specify window size? - exponential_moving_average = "exponential_moving_average" # TODO: how to specify alpha. 
-
-
-def apply_moving_average(step_df, column: str, window: int = 20):
-    return step_df.groupby("run")[column].transform(lambda x: x.rolling(window=window).mean())
-
-
-def apply_exponential_moving_average(step_df, column: str, alpha: float = 0.5):
-    return step_df.groupby("run")[column].transform(lambda x: x.ewm(alpha=alpha).mean())
-
-
-def get_downstream_predictions(
-    configs: Dict[str, FinalConfig],
-    tasks: Dict,
-    feature_type: DownstreamPredictionFeatures = DownstreamPredictionFeatures.raw,
-    use_last_n_points_step1: int = 1,
-    use_last_n_percentage: float = 1.0,
-    *,
-    save_figures: Optional[str] = None,
-    target_n_d: Optional[Tuple[int, int]] = None,
-    **feature_kwargs,
-):
-    assert 0.0 <= use_last_n_percentage <= 1.0
-    do_plot = save_figures is not None
-
-    if do_plot:
-        rows = len(tasks.keys())
-        fig, axes = plt.subplots(rows, 3, figsize=(20, 5 * rows))
-
-    no_error = target_n_d is not None
-
-    if not no_error:
-        target = [run_name for run_name in configs if configs[run_name].mode == "eval"][0]
-        step1_error: Dict = {target: {}}
-        stacked_error: Dict = {target: {}}
-    else:
-        target = "_".join([str(x) for x in target_n_d])
-
-    step1_predictions: Dict = {target: {}}
-    stacked_predictions: Dict = {target: {}}
-
-    for i, (task_name, task) in enumerate(tasks.items()):
-        tokens = get_all_data_by_name(configs, ["throughput/total_tokens"])
-        bpb_loss = get_all_data_by_name(configs, task["bpb"])
-        downstream_loss = get_all_data_by_name(configs, task["score"])
-
-        step1_df = get_dataframe_from_configs(tokens, bpb_loss, configs)
-
-        if feature_type == DownstreamPredictionFeatures.moving_average:
-            step1_df["y"] = apply_moving_average(step1_df, "y", **feature_kwargs)
-        elif feature_type == DownstreamPredictionFeatures.exponential_moving_average:
-            step1_df["y"] = apply_exponential_moving_average(step1_df, "y", **feature_kwargs)
-
-        step1_df = (
-            step1_df.groupby("run")
-            .apply(lambda rows: rows.iloc[-use_last_n_points_step1:], include_groups=False)
-            .reset_index()
-        )
-        step1_df, coefficients = fit_step1(step1_df)
-
-        if not no_error:
-            target_n_d = [
-                step1_df[step1_df["mode"] == "eval"].params.iloc[0],
-                step1_df[step1_df["mode"] == "eval"].x.iloc[0],
-            ]
-
-        step1_predictions[target][task_name] = predict_step1(*target_n_d, coefficients)
-
-        if do_plot:
-            plot_step1(
-                step1_df,
-                coefficients,
-                axes[i][0],
-                x_label="tokens",
-                y_label="task loss",
-                title=f"predicting task_loss ({task_name})",
-                do_label=True,
-                logscale=True,
-            )
-
-        step2_df = get_dataframe_from_configs(bpb_loss, downstream_loss, configs)
-
-        step2_df = (
-            step2_df.groupby("run")
-            .apply(lambda x: x.iloc[-int(np.ceil(use_last_n_percentage * len(x))) :], include_groups=False)
-            .reset_index()
-        )
-
-        if feature_type == DownstreamPredictionFeatures.moving_average:
-            step2_df["x"] = apply_moving_average(step2_df, "x", **feature_kwargs)
-        elif feature_type == DownstreamPredictionFeatures.exponential_moving_average:
-            step2_df["x"] = apply_exponential_moving_average(step2_df, "x", **feature_kwargs)
-
-        # Replace the target's observed loss with the step-1 prediction, so the
-        # stacked (step 1 -> step 2) prediction never sees the target's true loss.
-        last_match_idx = step2_df.loc[step2_df["mode"] == "eval"].tail(1).index
-        step2_df.loc[last_match_idx, "x"] = step1_predictions[target][task_name]
-
-        step2_df, coefficients = fit_step2(step2_df, tasks[task_name]["baseline"])
-
-        stacked_predictions[target][task_name] = predict_step2(step1_predictions[target][task_name], coefficients)
-
-        if do_plot:
-            plot_step2(
-                step2_df,
-                coefficients,
-                axes[i][1],
-                x_label="task loss",
-                y_label="task accuracy",
-                title=f"predicting task_accuracy ({task_name})",
-                do_label=True,
-            )
-
-        if not no_error:
-            step1_error[target][task_name] = get_predicted_error(step1_df)
-            stacked_error[target][task_name] = get_predicted_error(step2_df)
-
-        if do_plot:
-            df = get_dataframe_from_configs(tokens, downstream_loss, configs)
-            plot_stacked(
-                df,
-                step2_df,
-                axes[i][2],
-                title=f"Stacked predictions using {feature_type} ({feature_kwargs})",
-                do_label=True,
-                do_grey=False,
-                logscale=True,
-            )
-
-    if do_plot:
-        fig.suptitle("Combined 2-step downstream predictions", fontsize=12)
-        fig.tight_layout()
-        fig.subplots_adjust(top=0.95)
-        fig.savefig(save_figures, dpi=300)
-        # plt.close()
-
-    if not no_error:
-        return step1_predictions, stacked_predictions, step1_error, stacked_error
-    else:
-        return step1_predictions, stacked_predictions
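For reference, a minimal runnable sketch (outside the diff) of the step-2 transform deleted above: `sigmoid` maps a task's bits-per-byte loss to task accuracy, and `reverse_sigmoid` is its exact algebraic inverse. The coefficients and the BPB value below are made-up, illustrative numbers, not fitted values from any run.

import numpy as np

def sigmoid(x, L, x0, k, b):
    return L / (1 + np.exp(-k * (x - x0))) + b

def reverse_sigmoid(y, L, x0, k, b):
    return x0 - 1 / k * np.log((L / (y - b)) - 1)

L, x0, k, b = -0.7, 0.9, 3.0, 0.95  # hypothetical coefficients; L < 0 so accuracy falls as loss rises
bpb = 0.8                           # hypothetical task BPB loss
acc = sigmoid(bpb, L, x0, k, b)     # ~0.652 with these coefficients
assert abs(reverse_sigmoid(acc, L, x0, k, b) - bpb) < 1e-9  # exact round trip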
diff --git a/scripts/stacked_predictions.py b/scripts/stacked_predictions.py
deleted file mode 100644
index 7fd29c80a..000000000
--- a/scripts/stacked_predictions.py
+++ /dev/null
@@ -1,214 +0,0 @@
-import argparse
-import json
-
-from typing import Dict
-
-import numpy as np
-import pandas as pd
-
-from olmo.scaling.scaling_laws.stacked_predictions import (
-    DownstreamPredictionFeatures,
-    get_downstream_predictions,
-)
-from olmo.scaling.scaling_laws.utils import FinalConfig
-
-import ladder_peteish as ladder
-
-# We only include the c4_en CE loss and the 6 dolma sets, as these are the sets we can include in the paper
-ce_columns = [
-    "eval/c4_en-validation/CrossEntropyLoss",
-    "eval/dolma_books-validation/CrossEntropyLoss",
-    "eval/dolma_common-crawl-validation/CrossEntropyLoss",
-    "eval/dolma_pes2o-validation/CrossEntropyLoss",
-    "eval/dolma_reddit-validation/CrossEntropyLoss",
-    "eval/dolma_stack-validation/CrossEntropyLoss",
-    "eval/dolma_wiki-validation/CrossEntropyLoss",
-]
-
-mmlu_names = ["mmlu_stem", "mmlu_humanities", "mmlu_social_sciences", "mmlu_other"]
-# mmlu_names = ["mmlu_humanities", "mmlu_social_sciences", "mmlu_other"]
-
-main_tasks = ["hellaswag", "arc_easy", "arc_challenge", "piqa", "openbookqa", "csqa", "socialiqa"]
-
-baselines_rc_5shot = {
-    "piqa": 0.5,
-    "socialiqa": 1 / 3,
-    "csqa": 0.2,
-}
-
-baselines_mc_5shot = {
-    "piqa": 0.5,
-    "socialiqa": 1 / 3,
-    "csqa": 0.2,
-}
-
-tasks_rc_5shot = {
-    f"{key}_rc_5shot": {
-        "bpb": [f"eval/downstream_bpb/{key}_rc_5shot_bpb_bpb"],
-        "score": [
-            f"eval/downstream/{key}_rc_5shot_len_norm"
-            if key not in ["arc_easy"]
-            else f"eval/downstream/{key}_rc_5shot_acc"
-        ],
-        "baseline": baselines_rc_5shot.get(key, 0.25),
-    }
-    for key in main_tasks
-}
-
-tasks_mmlu_var = {
-    f"{key}_var": {
-        "bpb": [f"eval/downstream_bpb/{key}_var_bpb_bpb"],
-        "score": [f"eval/downstream/{key}_var_len_norm"],
-        "baseline": 0.25,
-    }
-    for key in mmlu_names
-}
-
-tasks = {**tasks_rc_5shot, **tasks_mmlu_var}
-
-
-def prettify(rel_error, is_percentage=True):
-    if is_percentage:
-        return f"{rel_error * 100:+.1f}%"
-    else:
-        return f"{rel_error:.2f}"
-
-
-def make_parser():
-    parser = argparse.ArgumentParser(
-        description="Get downstream predictions for a target model, based on model ladder outputs."
-    )
-    # TODO: Give an example.
-
-    parser.add_argument("config_path", help="Path to config specifying the input and target model runs.")
-
-    parser.add_argument(
-        "--save_figures",
-        type=str,
-        help="Use this to specify a png path for saving the plots for fitted curves. If not specified, plots will not be created.",
-        default=None,
-    )
-
-    parser.add_argument(
-        "--use_last_n_points_step1",
-        type=int,
-        default=1,
-        help="Optionally extend the number of training points for step 1 to the last n (default=1)",
-    )
-
-    parser.add_argument(
-        "--use_last_n_percentage",
-        type=float,
-        default=1.0,
-        help="Optionally limit the training points for the sigmoid fit to the last n fraction (float; 0.02 is the last 2%%)",
-    )
-
-    parser.add_argument(
-        "--feature_type",
-        type=str,
-        default=DownstreamPredictionFeatures.raw,
-        help="{raw, moving_average, exponential_moving_average}",
-    )
-
-    parser.add_argument("--feature_kwargs", type=str, default="{}", help='E.g. {"window": 20}')
-
-    parser.add_argument(
-        "--target_n", type=str, default="", help="Target number of parameters to predict for. Use with `target_d`."
-    )
-    parser.add_argument(
-        "--target_d", type=str, default="", help="Target number of tokens to predict for. Use with `target_n`."
-    )
-
-    return parser
-
-
-def save_predictions(output_path: str, target_n: int, step1_predictions: Dict, stacked_predictions: Dict):
-    save_dict = {}
-    save_dict["throughput/total_tokens"] = target_n
-
-    for key, val in list(step1_predictions.values())[0].items():
-        save_dict[tasks[key]["bpb"][0]] = val
-    for key, val in list(stacked_predictions.values())[0].items():
-        save_dict[tasks[key]["score"][0]] = val
-
-    df = pd.DataFrame([save_dict])
-    df.to_csv(output_path, index=False)
-
-
-def main():
-    parser = make_parser()
-    args = parser.parse_args()
-
-    # if args.save_figures is not None:
-    #     os.makedirs(args.save_figures, exist_ok=True)
-
-    with open(args.config_path) as f:
-        configs = json.load(f)
-        configs = {name: FinalConfig(**config) for name, config in configs.items()}
-
-    feature_kwargs = json.loads(args.feature_kwargs)
-
-    if args.target_n != "" and args.target_d != "":
-        no_error = True
-        model_size = ladder.parse_size(args.target_n)
-        model_length = ladder.parse_length(args.target_d, model_size)
-        target_n_d = [model_size, model_length]
-        step1_predictions, stacked_predictions = get_downstream_predictions(
-            configs,
-            tasks,
-            args.feature_type,
-            args.use_last_n_points_step1,
-            args.use_last_n_percentage,
-            save_figures=args.save_figures,
-            target_n_d=target_n_d,
-            **feature_kwargs,
-        )
-    else:
-        no_error = False
-        step1_predictions, stacked_predictions, step1_error, stacked_error = get_downstream_predictions(
-            configs,
-            tasks,
-            args.feature_type,
-            args.use_last_n_points_step1,
-            args.use_last_n_percentage,
-            save_figures=args.save_figures,
-            **feature_kwargs,
-        )
-
-    mkdn = """| Task | Step1 prediction | Stacked prediction |\n| --- | --- | --- |"""
-
-    for task in tasks:
-        mkdn += f"\n| {task} |"
-        for target in stacked_predictions:
-            mkdn += f" {prettify(step1_predictions[target][task], False)} | {prettify(stacked_predictions[target][task], False)} |"
-
-    print(mkdn)
-    print()
-
-    if not no_error:
-        mkdn = """| Task | Step1 error | Stacked error |\n| --- | --- | --- |"""
-
-        for task in tasks:
-            mkdn += f"\n| {task} |"
-            for target in stacked_error:
-                mkdn += f" {prettify(step1_error[target][task])} | {prettify(stacked_error[target][task])} |"
-
-        mkdn += "\n| **Avg signed error** | "
-        for target in stacked_error:
-            mkdn += f"**{prettify(np.mean(list(step1_error[target].values())))}** | **{prettify(np.mean(list(stacked_error[target].values())))}** |"
-
-        mkdn += "\n| **Avg unsigned error** | "
-        for target in stacked_error:
-            mkdn += f"**{prettify(np.mean(np.abs(list(step1_error[target].values()))))}** | **{prettify(np.mean(np.abs(list(stacked_error[target].values()))))}** |"
-        print(mkdn)
-
-    # do_save_predictions = False
-    # if do_save_predictions:
-    #     save_predictions(f"wandb/peteish-final-new/{args.target_n}-{args.target_d}.csv", model_size, step1_predictions, stacked_predictions)
-
-
-if __name__ == "__main__":
-    main()
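For reference, a hedged sketch (outside the diff) of how the deleted script was driven. The config JSON maps run names to `FinalConfig` kwargs; the fields shown (`paths`, `mode`, `n`, `color`) are the ones the deleted module reads, though `FinalConfig` may accept or require others. All run names, parameter counts, and paths here are hypothetical.

import json

# Hypothetical ladder config: "train" runs are fitted; the single "eval"
# run is the prediction target. Each CSV holds one run's logged metrics,
# and its "size-length" filename is parsed by size_length_from_path.
configs = {
    "190M-1xC": {"paths": ["wandb/190M-1xC.csv"], "mode": "train", "n": 190_000_000, "color": "darkred"},
    "370M-1xC": {"paths": ["wandb/370M-1xC.csv"], "mode": "train", "n": 370_000_000, "color": "darkorange"},
    "1B-10xC": {"paths": ["wandb/1B-10xC.csv"], "mode": "eval", "n": 1_300_000_000, "color": "teal"},
}
with open("configs/ladder.json", "w") as f:
    json.dump(configs, f, indent=2)

# Then, for example:
#   python scripts/stacked_predictions.py configs/ladder.json \
#       --feature_type moving_average --feature_kwargs '{"window": 20}' \
#       --save_figures figures/stacked.png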