diff --git a/olmo/eval/downstream.py b/olmo/eval/downstream.py
index 2a9d1d365..a53c875c1 100644
--- a/olmo/eval/downstream.py
+++ b/olmo/eval/downstream.py
@@ -1863,3 +1863,502 @@ def doc_to_label(self, doc) -> int:
         {"dataset_path": "winogrande", "dataset_name": "rc_5shot", "metric_type": "bpb"},
     ),
 }
+
+# Standardized tasks and metrics to evaluate for the ladder; each label maps to a (task class, options) pair.
+# Train and test splits are added where applicable, alongside the validation split.
+label_to_task_map_new = {
+    "arc_challenge_train_rc_5shot": (
+        OEEvalTask,
+        {"dataset_path": "arc_challenge", "dataset_name": "train_rc_5shot", "metric_type": "len_norm"},
+    ),
+    "arc_challenge_train_rc_5shot_bpb": (
+        OEEvalTask,
+        {"dataset_path": "arc_challenge", "dataset_name": "train_rc_5shot", "metric_type": "bpb"},
+    ),
+    "arc_challenge_train_mc_5shot": (
+        OEEvalTask,
+        {"dataset_path": "arc_challenge", "dataset_name": "train_mc_5shot", "metric_type": "acc"},
+    ),
+    "arc_challenge_train_mc_5shot_bpb": (
+        OEEvalTask,
+        {"dataset_path": "arc_challenge", "dataset_name": "train_mc_5shot", "metric_type": "bpb"},
+    ),
+    "arc_challenge_val_rc_5shot": (
+        OEEvalTask,
+        {"dataset_path": "arc_challenge", "dataset_name": "val_rc_5shot", "metric_type": "len_norm"},
+    ),
+    "arc_challenge_val_rc_5shot_bpb": (
+        OEEvalTask,
+        {"dataset_path": "arc_challenge", "dataset_name": "val_rc_5shot", "metric_type": "bpb"},
+    ),
+    "arc_challenge_val_mc_5shot": (
+        OEEvalTask,
+        {"dataset_path": "arc_challenge", "dataset_name": "val_mc_5shot", "metric_type": "acc"},
+    ),
+    "arc_challenge_val_mc_5shot_bpb": (
+        OEEvalTask,
+        {"dataset_path": "arc_challenge", "dataset_name": "val_mc_5shot", "metric_type": "bpb"},
+    ),
+    "arc_challenge_test_rc_5shot": (
+        OEEvalTask,
+        {"dataset_path": "arc_challenge", "dataset_name": "test_rc_5shot", "metric_type": "len_norm"},
+    ),
+    "arc_challenge_test_rc_5shot_bpb": (
+        OEEvalTask,
+        {"dataset_path": "arc_challenge", "dataset_name": "test_rc_5shot", "metric_type": "bpb"},
+    ),
+    "arc_challenge_test_mc_5shot": (
+        OEEvalTask,
+        {"dataset_path": "arc_challenge", "dataset_name": "test_mc_5shot", "metric_type": "acc"},
+    ),
+    "arc_challenge_test_mc_5shot_bpb": (
+        OEEvalTask,
+        {"dataset_path": "arc_challenge", "dataset_name": "test_mc_5shot", "metric_type": "bpb"},
+    ),
+    "arc_easy_train_rc_5shot": (
+        OEEvalTask,
+        {"dataset_path": "arc_easy", "dataset_name": "train_rc_5shot", "metric_type": "len_norm"},
+    ),  # metric_type here used to be acc
+    "arc_easy_train_rc_5shot_bpb": (
+        OEEvalTask,
+        {"dataset_path": "arc_easy", "dataset_name": "train_rc_5shot", "metric_type": "bpb"},
+    ),
+    "arc_easy_train_mc_5shot": (
+        OEEvalTask,
+        {"dataset_path": "arc_easy", "dataset_name": "train_mc_5shot", "metric_type": "acc"},
+    ),
+    "arc_easy_train_mc_5shot_bpb": (
+        OEEvalTask,
+        {"dataset_path": "arc_easy", "dataset_name": "train_mc_5shot", "metric_type": "bpb"},
+    ),
+    "arc_easy_val_rc_5shot": (
+        OEEvalTask,
+        {"dataset_path": "arc_easy", "dataset_name": "val_rc_5shot", "metric_type": "len_norm"},
+    ),
+    "arc_easy_val_rc_5shot_bpb": (
+        OEEvalTask,
+        {"dataset_path": "arc_easy", "dataset_name": "val_rc_5shot", "metric_type": "bpb"},
+    ),
+    "arc_easy_val_mc_5shot": (
+        OEEvalTask,
+        {"dataset_path": "arc_easy", "dataset_name": "val_mc_5shot", "metric_type": "acc"},
+    ),
+    "arc_easy_val_mc_5shot_bpb": (
+        OEEvalTask,
+        {"dataset_path": "arc_easy", "dataset_name": "val_mc_5shot", "metric_type": "bpb"},
+    ),
+    "arc_easy_test_rc_5shot": (
+        OEEvalTask,
+        {"dataset_path": "arc_easy", "dataset_name": "test_rc_5shot", "metric_type": "len_norm"},
+    ),
+    "arc_easy_test_rc_5shot_bpb": (
+        OEEvalTask,
+        {"dataset_path": "arc_easy", "dataset_name": "test_rc_5shot", "metric_type": "bpb"},
+    ),
+    "arc_easy_test_mc_5shot": (
+        OEEvalTask,
+        {"dataset_path": "arc_easy", "dataset_name": "test_mc_5shot", "metric_type": "acc"},
+    ),
+    "arc_easy_test_mc_5shot_bpb": (
+        OEEvalTask,
+        {"dataset_path": "arc_easy", "dataset_name": "test_mc_5shot", "metric_type": "bpb"},
+    ),
+    "boolq_train_rc_5shot": (
+        OEEvalTask,
+        {"dataset_path": "boolq", "dataset_name": "train_rc_5shot", "metric_type": "len_norm"},
+    ),  # metric_type here used to be acc
+    "boolq_train_rc_5shot_bpb": (
+        OEEvalTask,
+        {"dataset_path": "boolq", "dataset_name": "train_rc_5shot", "metric_type": "bpb"},
+    ),
+    "boolq_train_mc_5shot": (
+        OEEvalTask,
+        {"dataset_path": "boolq", "dataset_name": "train_mc_5shot", "metric_type": "acc"},
+    ),
+    "boolq_train_mc_5shot_bpb": (
+        OEEvalTask,
+        {"dataset_path": "boolq", "dataset_name": "train_mc_5shot", "metric_type": "bpb"},
+    ),
+    "boolq_val_rc_5shot": (
+        OEEvalTask,
+        {"dataset_path": "boolq", "dataset_name": "val_rc_5shot", "metric_type": "len_norm"},
+    ),
+    "boolq_val_rc_5shot_bpb": (
+        OEEvalTask,
+        {"dataset_path": "boolq", "dataset_name": "val_rc_5shot", "metric_type": "bpb"},
+    ),
+    "boolq_val_mc_5shot": (
+        OEEvalTask,
+        {"dataset_path": "boolq", "dataset_name": "val_mc_5shot", "metric_type": "acc"},
+    ),
+    "boolq_val_mc_5shot_bpb": (
+        OEEvalTask,
+        {"dataset_path": "boolq", "dataset_name": "val_mc_5shot", "metric_type": "bpb"},
+    ),
+    "csqa_train_rc_5shot": (
+        OEEvalTask,
+        {"dataset_path": "csqa", "dataset_name": "train_rc_5shot", "metric_type": "len_norm"},
+    ),
+    "csqa_train_rc_5shot_bpb": (
+        OEEvalTask,
+        {"dataset_path": "csqa", "dataset_name": "train_rc_5shot", "metric_type": "bpb"},
+    ),
+    "csqa_train_mc_5shot": (
+        OEEvalTask,
+        {"dataset_path": "csqa", "dataset_name": "train_mc_5shot", "metric_type": "acc"},
+    ),
+    "csqa_train_mc_5shot_bpb": (
+        OEEvalTask,
+        {"dataset_path": "csqa", "dataset_name": "train_mc_5shot", "metric_type": "bpb"},
+    ),
+    "csqa_val_rc_5shot": (
+        OEEvalTask,
+        {"dataset_path": "csqa", "dataset_name": "val_rc_5shot", "metric_type": "len_norm"},
+    ),
+    "csqa_val_rc_5shot_bpb": (
+        OEEvalTask,
+        {"dataset_path": "csqa", "dataset_name": "val_rc_5shot", "metric_type": "bpb"},
+    ),
+    "csqa_val_mc_5shot": (
+        OEEvalTask,
+        {"dataset_path": "csqa", "dataset_name": "val_mc_5shot", "metric_type": "acc"},
+    ),
+    "csqa_val_mc_5shot_bpb": (
+        OEEvalTask,
+        {"dataset_path": "csqa", "dataset_name": "val_mc_5shot", "metric_type": "bpb"},
+    ),
+    "hellaswag_train_rc_5shot": (
+        OEEvalTask,
+        {"dataset_path": "hellaswag", "dataset_name": "train_rc_5shot", "metric_type": "len_norm"},
+    ),
+    "hellaswag_train_rc_5shot_bpb": (
+        OEEvalTask,
+        {"dataset_path": "hellaswag", "dataset_name": "train_rc_5shot", "metric_type": "bpb"},
+    ),
+    "hellaswag_train_mc_5shot": (
+        OEEvalTask,
+        {"dataset_path": "hellaswag", "dataset_name": "train_mc_5shot", "metric_type": "acc"},
+    ),
+    "hellaswag_train_mc_5shot_bpb": (
+        OEEvalTask,
+        {"dataset_path": "hellaswag", "dataset_name": "train_mc_5shot", "metric_type": "bpb"},
+    ),
+    "hellaswag_val_rc_5shot": (
+        OEEvalTask,
+        {"dataset_path": "hellaswag", "dataset_name": "val_rc_5shot", "metric_type": "len_norm"},
+    ),
+    "hellaswag_val_rc_5shot_bpb": (
+        OEEvalTask,
+        {"dataset_path": "hellaswag", "dataset_name": "val_rc_5shot", "metric_type": "bpb"},
+    ),
+    "hellaswag_val_mc_5shot": (
+        OEEvalTask,
+        {"dataset_path": "hellaswag", "dataset_name": "val_mc_5shot", "metric_type": "acc"},
+    ),
+    "hellaswag_val_mc_5shot_bpb": (
+        OEEvalTask,
+        {"dataset_path": "hellaswag", "dataset_name": "val_mc_5shot", "metric_type": "bpb"},
+    ),
+    "openbookqa_train_rc_5shot": (
+        OEEvalTask,
+        {"dataset_path": "openbookqa", "dataset_name": "train_rc_5shot", "metric_type": "len_norm"},
+    ),
+    "openbookqa_train_rc_5shot_bpb": (
+        OEEvalTask,
+        {"dataset_path": "openbookqa", "dataset_name": "train_rc_5shot", "metric_type": "bpb"},
+    ),
+    "openbookqa_train_mc_5shot": (
+        OEEvalTask,
+        {"dataset_path": "openbookqa", "dataset_name": "train_mc_5shot", "metric_type": "acc"},
+    ),
+    "openbookqa_train_mc_5shot_bpb": (
+        OEEvalTask,
+        {"dataset_path": "openbookqa", "dataset_name": "train_mc_5shot", "metric_type": "bpb"},
+    ),
+    "openbookqa_val_rc_5shot": (
+        OEEvalTask,
+        {"dataset_path": "openbookqa", "dataset_name": "val_rc_5shot", "metric_type": "len_norm"},
+    ),
+    "openbookqa_val_rc_5shot_bpb": (
+        OEEvalTask,
+        {"dataset_path": "openbookqa", "dataset_name": "val_rc_5shot", "metric_type": "bpb"},
+    ),
+    "openbookqa_val_mc_5shot": (
+        OEEvalTask,
+        {"dataset_path": "openbookqa", "dataset_name": "val_mc_5shot", "metric_type": "acc"},
+    ),
+    "openbookqa_val_mc_5shot_bpb": (
+        OEEvalTask,
+        {"dataset_path": "openbookqa", "dataset_name": "val_mc_5shot", "metric_type": "bpb"},
+    ),
+    "openbookqa_test_rc_5shot": (
+        OEEvalTask,
+        {"dataset_path": "openbookqa", "dataset_name": "test_rc_5shot", "metric_type": "len_norm"},
+    ),
+    "openbookqa_test_rc_5shot_bpb": (
+        OEEvalTask,
+        {"dataset_path": "openbookqa", "dataset_name": "test_rc_5shot", "metric_type": "bpb"},
+    ),
+    "openbookqa_test_mc_5shot": (
+        OEEvalTask,
+        {"dataset_path": "openbookqa", "dataset_name": "test_mc_5shot", "metric_type": "acc"},
+    ),
+    "openbookqa_test_mc_5shot_bpb": (
+        OEEvalTask,
+        {"dataset_path": "openbookqa", "dataset_name": "test_mc_5shot", "metric_type": "bpb"},
+    ),
+    "piqa_train_rc_5shot": (
+        OEEvalTask,
+        {"dataset_path": "piqa", "dataset_name": "train_rc_5shot", "metric_type": "len_norm"},
+    ),
+    "piqa_train_rc_5shot_bpb": (
+        OEEvalTask,
+        {"dataset_path": "piqa", "dataset_name": "train_rc_5shot", "metric_type": "bpb"},
+    ),
+    "piqa_train_mc_5shot": (
+        OEEvalTask,
+        {"dataset_path": "piqa", "dataset_name": "train_mc_5shot", "metric_type": "acc"},
+    ),
+    "piqa_train_mc_5shot_bpb": (
+        OEEvalTask,
+        {"dataset_path": "piqa", "dataset_name": "train_mc_5shot", "metric_type": "bpb"},
+    ),
+    "piqa_val_rc_5shot": (
+        OEEvalTask,
+        {"dataset_path": "piqa", "dataset_name": "val_rc_5shot", "metric_type": "len_norm"},
+    ),
+    "piqa_val_rc_5shot_bpb": (
+        OEEvalTask,
+        {"dataset_path": "piqa", "dataset_name": "val_rc_5shot", "metric_type": "bpb"},
+    ),
+    "piqa_val_mc_5shot": (
+        OEEvalTask,
+        {"dataset_path": "piqa", "dataset_name": "val_mc_5shot", "metric_type": "acc"},
+    ),
+    "piqa_val_mc_5shot_bpb": (
+        OEEvalTask,
+        {"dataset_path": "piqa", "dataset_name": "val_mc_5shot", "metric_type": "bpb"},
+    ),
+    "socialiqa_train_rc_5shot": (
+        OEEvalTask,
+        {"dataset_path": "socialiqa", "dataset_name": "train_rc_5shot", "metric_type": "len_norm"},
+    ),
+    "socialiqa_train_rc_5shot_bpb": (
+        OEEvalTask,
+        {"dataset_path": "socialiqa", "dataset_name": "train_rc_5shot", "metric_type": "bpb"},
+    ),
+    "socialiqa_train_mc_5shot": (
+        OEEvalTask,
+        {"dataset_path": "socialiqa", "dataset_name": "train_mc_5shot", "metric_type": "acc"},
+    ),
+    "socialiqa_train_mc_5shot_bpb": (
+        OEEvalTask,
+        {"dataset_path": "socialiqa", "dataset_name": "train_mc_5shot", "metric_type": "bpb"},
+    ),
+    "socialiqa_val_rc_5shot": (
+        OEEvalTask,
+        {"dataset_path": "socialiqa", "dataset_name": "val_rc_5shot", "metric_type": "len_norm"},
+    ),
+    "socialiqa_val_rc_5shot_bpb": (
+        OEEvalTask,
+        {"dataset_path": "socialiqa", "dataset_name": "val_rc_5shot", "metric_type": "bpb"},
+    ),
+    "socialiqa_val_mc_5shot": (
+        OEEvalTask,
+        {"dataset_path": "socialiqa", "dataset_name": "val_mc_5shot", "metric_type": "acc"},
+    ),
+    "socialiqa_val_mc_5shot_bpb": (
+        OEEvalTask,
+        {"dataset_path": "socialiqa", "dataset_name": "val_mc_5shot", "metric_type": "bpb"},
+    ),
+    "winogrande_train_rc_5shot": (
+        OEEvalTask,
+        {"dataset_path": "winogrande", "dataset_name": "train_rc_5shot", "metric_type": "len_norm"},
+    ),  # metric_type here used to be acc
+    "winogrande_train_rc_5shot_bpb": (
+        OEEvalTask,
+        {"dataset_path": "winogrande", "dataset_name": "train_rc_5shot", "metric_type": "bpb"},
+    ),
+    "winogrande_train_mc_5shot": (
+        OEEvalTask,
+        {"dataset_path": "winogrande", "dataset_name": "train_mc_5shot", "metric_type": "acc"},
+    ),
+    "winogrande_train_mc_5shot_bpb": (
+        OEEvalTask,
+        {"dataset_path": "winogrande", "dataset_name": "train_mc_5shot", "metric_type": "bpb"},
+    ),
+    "winogrande_val_rc_5shot": (
+        OEEvalTask,
+        {"dataset_path": "winogrande", "dataset_name": "val_rc_5shot", "metric_type": "len_norm"},
+    ),
+    "winogrande_val_rc_5shot_bpb": (
+        OEEvalTask,
+        {"dataset_path": "winogrande", "dataset_name": "val_rc_5shot", "metric_type": "bpb"},
+    ),
+    "winogrande_val_mc_5shot": (
+        OEEvalTask,
+        {"dataset_path": "winogrande", "dataset_name": "val_mc_5shot", "metric_type": "acc"},
+    ),
+    "winogrande_val_mc_5shot_bpb": (
+        OEEvalTask,
+        {"dataset_path": "winogrande", "dataset_name": "val_mc_5shot", "metric_type": "bpb"},
+    ),
+    "mmlu_stem_val_rc_var": (MMLU, {"dataset_name": "stem", "prompt_variations": 1}),
+    "mmlu_stem_val_rc_var_bpb": (MMLU, {"dataset_name": "stem", "prompt_variations": 1, "metric_type": "bpb"}),
+    "mmlu_stem_val_rc_5shot": (MMLU, {"dataset_name": "stem", "prompt_variations": 2}),
+    "mmlu_stem_val_rc_5shot_bpb": (MMLU, {"dataset_name": "stem", "prompt_variations": 2, "metric_type": "bpb"}),
+    "mmlu_stem_val_mc_5shot": (MMLU, {"dataset_name": "stem", "prompt_variations": 2, "mc_labels": True}),
+    "mmlu_stem_val_mc_5shot_bpb": (
+        MMLU,
+        {"dataset_name": "stem", "prompt_variations": 2, "mc_labels": True, "metric_type": "bpb"},
+    ),
+    "mmlu_stem_test_rc_var": (MMLU, {"dataset_name": "stem", "split": "test", "prompt_variations": 1}),
+    "mmlu_stem_test_rc_var_bpb": (
+        MMLU,
+        {"dataset_name": "stem", "split": "test", "prompt_variations": 1, "metric_type": "bpb"},
+    ),
+    "mmlu_stem_test_rc_5shot": (MMLU, {"dataset_name": "stem", "split": "test", "prompt_variations": 2}),
+    "mmlu_stem_test_rc_5shot_bpb": (
+        MMLU,
+        {"dataset_name": "stem", "split": "test", "prompt_variations": 2, "metric_type": "bpb"},
+    ),
+    "mmlu_stem_test_mc_5shot": (
+        MMLU,
+        {"dataset_name": "stem", "split": "test", "prompt_variations": 2, "mc_labels": True},
+    ),
+    "mmlu_stem_test_mc_5shot_bpb": (
+        MMLU,
+        {"dataset_name": "stem", "split": "test", "prompt_variations": 2, "mc_labels": True, "metric_type": "bpb"},
+    ),
+    "mmlu_humanities_val_rc_var": (MMLU, {"dataset_name": "humanities", "prompt_variations": 1}),
+    "mmlu_humanities_val_rc_var_bpb": (
+        MMLU,
+        {"dataset_name": "humanities", "prompt_variations": 1, "metric_type": "bpb"},
+    ),
+    "mmlu_humanities_val_rc_5shot": (MMLU, {"dataset_name": "humanities", "prompt_variations": 2}),
+    "mmlu_humanities_val_rc_5shot_bpb": (
+        MMLU,
+        {"dataset_name": "humanities", "prompt_variations": 2, "metric_type": "bpb"},
+    ),
+    "mmlu_humanities_val_mc_5shot": (
+        MMLU,
+        {"dataset_name": "humanities", "prompt_variations": 2, "mc_labels": True},
+    ),
+    "mmlu_humanities_val_mc_5shot_bpb": (
+        MMLU,
+        {"dataset_name": "humanities", "prompt_variations": 2, "mc_labels": True, "metric_type": "bpb"},
+    ),
+    "mmlu_humanities_test_rc_var": (MMLU, {"dataset_name": "humanities", "split": "test", "prompt_variations": 1}),
+    "mmlu_humanities_test_rc_var_bpb": (
+        MMLU,
+        {"dataset_name": "humanities", "split": "test", "prompt_variations": 1, "metric_type": "bpb"},
+    ),
+    "mmlu_humanities_test_rc_5shot": (
+        MMLU,
+        {"dataset_name": "humanities", "split": "test", "prompt_variations": 2},
+    ),
+    "mmlu_humanities_test_rc_5shot_bpb": (
+        MMLU,
+        {"dataset_name": "humanities", "split": "test", "prompt_variations": 2, "metric_type": "bpb"},
+    ),
+    "mmlu_humanities_test_mc_5shot": (
+        MMLU,
+        {"dataset_name": "humanities", "split": "test", "prompt_variations": 2, "mc_labels": True},
+    ),
+    "mmlu_humanities_test_mc_5shot_bpb": (
+        MMLU,
+        {
+            "dataset_name": "humanities",
+            "split": "test",
+            "prompt_variations": 2,
+            "mc_labels": True,
+            "metric_type": "bpb",
+        },
+    ),
+    "mmlu_social_sciences_val_rc_var": (MMLU, {"dataset_name": "social_sciences", "prompt_variations": 1}),
+    "mmlu_social_sciences_val_rc_var_bpb": (
+        MMLU,
+        {"dataset_name": "social_sciences", "prompt_variations": 1, "metric_type": "bpb"},
+    ),
+    "mmlu_social_sciences_val_rc_5shot": (MMLU, {"dataset_name": "social_sciences", "prompt_variations": 2}),
+    "mmlu_social_sciences_val_rc_5shot_bpb": (
+        MMLU,
+        {"dataset_name": "social_sciences", "prompt_variations": 2, "metric_type": "bpb"},
+    ),
+    "mmlu_social_sciences_val_mc_5shot": (
+        MMLU,
+        {"dataset_name": "social_sciences", "prompt_variations": 2, "mc_labels": True},
+    ),
+    "mmlu_social_sciences_val_mc_5shot_bpb": (
+        MMLU,
+        {"dataset_name": "social_sciences", "prompt_variations": 2, "mc_labels": True, "metric_type": "bpb"},
+    ),
+    "mmlu_social_sciences_test_rc_var": (
+        MMLU,
+        {"dataset_name": "social_sciences", "split": "test", "prompt_variations": 1},
+    ),
+    "mmlu_social_sciences_test_rc_var_bpb": (
+        MMLU,
+        {"dataset_name": "social_sciences", "split": "test", "prompt_variations": 1, "metric_type": "bpb"},
+    ),
+    "mmlu_social_sciences_test_rc_5shot": (
+        MMLU,
+        {"dataset_name": "social_sciences", "split": "test", "prompt_variations": 2},
+    ),
+    "mmlu_social_sciences_test_rc_5shot_bpb": (
+        MMLU,
+        {"dataset_name": "social_sciences", "split": "test", "prompt_variations": 2, "metric_type": "bpb"},
+    ),
+    "mmlu_social_sciences_test_mc_5shot": (
+        MMLU,
+        {"dataset_name": "social_sciences", "split": "test", "prompt_variations": 2, "mc_labels": True},
+    ),
+    "mmlu_social_sciences_test_mc_5shot_bpb": (
+        MMLU,
+        {
+            "dataset_name": "social_sciences",
+            "split": "test",
+            "prompt_variations": 2,
+            "mc_labels": True,
+            "metric_type": "bpb",
+        },
+    ),
+    "mmlu_other_val_rc_var": (MMLU, {"dataset_name": "other", "prompt_variations": 1}),
+    "mmlu_other_val_rc_var_bpb": (MMLU, {"dataset_name": "other", "prompt_variations": 1, "metric_type": "bpb"}),
+    "mmlu_other_val_rc_5shot": (MMLU, {"dataset_name": "other", "prompt_variations": 2}),
+    "mmlu_other_val_rc_5shot_bpb": (MMLU, {"dataset_name": "other", "prompt_variations": 2, "metric_type": "bpb"}),
+    "mmlu_other_val_mc_5shot": (MMLU, {"dataset_name": "other", "prompt_variations": 2, "mc_labels": True}),
+    "mmlu_other_val_mc_5shot_bpb": (
+        MMLU,
+        {"dataset_name": "other", "prompt_variations": 2, "mc_labels": True, "metric_type": "bpb"},
+    ),
+    "mmlu_other_test_rc_var": (MMLU, {"dataset_name": "other", "split": "test", "prompt_variations": 1}),
+    "mmlu_other_test_rc_var_bpb": (
+        MMLU,
+        {"dataset_name": "other", "split": "test", "prompt_variations": 1, "metric_type": "bpb"},
+    ),
+    "mmlu_other_test_rc_5shot": (MMLU, {"dataset_name": "other", "split": "test", "prompt_variations": 2}),
+    "mmlu_other_test_rc_5shot_bpb": (
+        MMLU,
+        {"dataset_name": "other", "split": "test", "prompt_variations": 2, "metric_type": "bpb"},
+    ),
+    "mmlu_other_test_mc_5shot": (
+        MMLU,
+        {"dataset_name": "other", "split": "test", "prompt_variations": 2, "mc_labels": True},
+    ),
+    "mmlu_other_test_mc_5shot_bpb": (
+        MMLU,
+        {
+            "dataset_name": "other",
+            "split": "test",
+            "prompt_variations": 2,
+            "mc_labels": True,
+            "metric_type": "bpb",
+        },
+    ),
+}
+
+label_to_task_map = {  # rebind the registry name to a merged dict holding old and new labels
+    **label_to_task_map,
+    **label_to_task_map_new,  # unpacked last, so these entries win if a label appears in both maps
+}
diff --git a/olmo_data/oe_eval_tasks/arc_challenge/test_mc_5shot/config.json b/olmo_data/oe_eval_tasks/arc_challenge/test_mc_5shot/config.json
new file mode 100644
index 000000000..9046b4b0b
--- /dev/null
+++ b/olmo_data/oe_eval_tasks/arc_challenge/test_mc_5shot/config.json
@@ -0,0 +1 @@
+{"task_name": "arc_challenge:mc", "task_hash": "11abfade7ecce501f3e3e72c937e19cc", "task_config": {"task_name": "arc_challenge:mc", "task_core": "arc_challenge", "limit": 1000000, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {}, "native_id_field": "id", "fewshot_source": "OLMES:ARC-Challenge", "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "arc_challenge:mc::olmes"}}, "current_date": "2024-11-18 22:05:58 UTC", "num_instances": 1172}
\ No newline at end of file
diff --git a/olmo_data/oe_eval_tasks/arc_challenge/test_mc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/arc_challenge/test_mc_5shot/requests.jsonl.gz
new file mode 100644
index 000000000..2b15ca26a
Binary files /dev/null and b/olmo_data/oe_eval_tasks/arc_challenge/test_mc_5shot/requests.jsonl.gz differ
diff --git a/olmo_data/oe_eval_tasks/arc_challenge/test_rc_5shot/config.json b/olmo_data/oe_eval_tasks/arc_challenge/test_rc_5shot/config.json
new file mode 100644
index 000000000..19413b36c
--- /dev/null
+++ b/olmo_data/oe_eval_tasks/arc_challenge/test_rc_5shot/config.json
@@ -0,0 +1 @@
+{"task_name": "arc_challenge", "task_hash": "b122d520ab0cf70114350ecf00c5c811", "task_config": {"task_name": "arc_challenge", "task_core": "arc_challenge", "limit": 1000000, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "id", "fewshot_source": "OLMES:ARC-Challenge", "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "arc_challenge:rc::olmes"}}, "current_date": "2024-11-18 21:50:18 UTC", "num_instances": 1172}
\ No newline at end of file
diff --git a/olmo_data/oe_eval_tasks/arc_challenge/test_rc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/arc_challenge/test_rc_5shot/requests.jsonl.gz
new file mode 100644
index 000000000..c33b541bf
Binary files /dev/null and b/olmo_data/oe_eval_tasks/arc_challenge/test_rc_5shot/requests.jsonl.gz differ
diff --git a/olmo_data/oe_eval_tasks/arc_challenge/train_mc_5shot/config.json b/olmo_data/oe_eval_tasks/arc_challenge/train_mc_5shot/config.json
new file mode 100644
index 000000000..32f357d25
--- /dev/null
+++ b/olmo_data/oe_eval_tasks/arc_challenge/train_mc_5shot/config.json
@@ -0,0 +1 @@
+{"task_name": "arc_challenge:mc", "task_hash": "cf2769a2dc6cbea724ff477c3d2543a2", "task_config": {"task_name": "arc_challenge:mc", "task_core": "arc_challenge", "limit": 1000000, "split": "train", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {}, "native_id_field": "id", "fewshot_source": "OLMES:ARC-Challenge", "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "arc_challenge:mc::olmes"}}, "current_date": "2024-11-18 22:05:40 UTC", "num_instances": 1119}
\ No newline at end of file
diff --git a/olmo_data/oe_eval_tasks/arc_challenge/train_mc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/arc_challenge/train_mc_5shot/requests.jsonl.gz
new file mode 100644
index 000000000..a50cc9525
Binary files /dev/null and b/olmo_data/oe_eval_tasks/arc_challenge/train_mc_5shot/requests.jsonl.gz differ
diff --git a/olmo_data/oe_eval_tasks/arc_challenge/train_rc_5shot/config.json b/olmo_data/oe_eval_tasks/arc_challenge/train_rc_5shot/config.json
new file mode 100644
index 000000000..f82d472d6
--- /dev/null
+++ b/olmo_data/oe_eval_tasks/arc_challenge/train_rc_5shot/config.json
@@ -0,0 +1 @@
+{"task_name": "arc_challenge", "task_hash": "9045ed0bd68a7e9ff34cf51ff24828bf", "task_config": {"task_name": "arc_challenge", "task_core": "arc_challenge", "limit": 1000000, "split": "train", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "id", "fewshot_source": "OLMES:ARC-Challenge", "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "arc_challenge:rc::olmes"}}, "current_date": "2024-11-18 22:05:31 UTC", "num_instances": 1119}
\ No newline at end of file
diff --git a/olmo_data/oe_eval_tasks/arc_challenge/train_rc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/arc_challenge/train_rc_5shot/requests.jsonl.gz
new file mode 100644
index 000000000..c93361804
Binary files /dev/null and b/olmo_data/oe_eval_tasks/arc_challenge/train_rc_5shot/requests.jsonl.gz differ
diff --git a/olmo_data/oe_eval_tasks/arc_challenge/val_mc_5shot/config.json b/olmo_data/oe_eval_tasks/arc_challenge/val_mc_5shot/config.json
new file mode 100644
index 000000000..d54a3f276
--- /dev/null
+++ b/olmo_data/oe_eval_tasks/arc_challenge/val_mc_5shot/config.json
@@ -0,0 +1 @@
+{"task_name": "arc_challenge:mc", "task_hash": "a673d7761ce3fc3d5061d72f76755971", "task_config": {"task_name": "arc_challenge:mc", "task_core": "arc_challenge", "limit": 1000000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {}, "native_id_field": "id", "fewshot_source": "OLMES:ARC-Challenge", "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "arc_challenge:mc::olmes"}}, "current_date": "2024-11-18 22:05:49 UTC", "num_instances": 299}
\ No newline at end of file
diff --git a/olmo_data/oe_eval_tasks/arc_challenge/val_mc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/arc_challenge/val_mc_5shot/requests.jsonl.gz
new file mode 100644
index 000000000..ad56ccd0c
Binary files /dev/null and b/olmo_data/oe_eval_tasks/arc_challenge/val_mc_5shot/requests.jsonl.gz differ
diff --git a/olmo_data/oe_eval_tasks/arc_challenge/val_rc_5shot/config.json b/olmo_data/oe_eval_tasks/arc_challenge/val_rc_5shot/config.json
new file mode 100644
index 000000000..b61640e44
--- /dev/null
+++ b/olmo_data/oe_eval_tasks/arc_challenge/val_rc_5shot/config.json
@@ -0,0 +1 @@
+{"task_name": "arc_challenge", "task_hash": "bd181c90c43b3ef799af2f300ea09cf1", "task_config": {"task_name": "arc_challenge", "task_core": "arc_challenge", "limit": 1000000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "id", "fewshot_source": "OLMES:ARC-Challenge", "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "arc_challenge:rc::olmes"}}, "current_date": "2024-11-18 21:45:07 UTC", "num_instances": 299}
\ No newline at end of file
diff --git a/olmo_data/oe_eval_tasks/arc_challenge/val_rc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/arc_challenge/val_rc_5shot/requests.jsonl.gz
new file mode 100644
index 000000000..2022bd65e
Binary files /dev/null and b/olmo_data/oe_eval_tasks/arc_challenge/val_rc_5shot/requests.jsonl.gz differ
diff --git a/olmo_data/oe_eval_tasks/arc_easy/test_mc_5shot/config.json b/olmo_data/oe_eval_tasks/arc_easy/test_mc_5shot/config.json
new file mode 100644
index 000000000..f0d69c824
--- /dev/null
+++ b/olmo_data/oe_eval_tasks/arc_easy/test_mc_5shot/config.json
@@ -0,0 +1 @@
+{"task_name": "arc_easy:mc", "task_hash": "64250ca6fdf0f02e07b539e8efc04922", "task_config": {"task_name": "arc_easy:mc", "task_core": "arc_easy", "limit": 1000000, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {}, "native_id_field": "id", "fewshot_source": "OLMES:ARC-Easy", "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"description": "ARC-Easy (MC) using OLMES-v0.1", "regimes": ["OLMES-v0.1"], "alias": "arc_easy:mc::olmes"}}, "current_date": "2024-11-18 22:06:33 UTC", "num_instances": 2376}
\ No newline at end of file
diff --git a/olmo_data/oe_eval_tasks/arc_easy/test_mc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/arc_easy/test_mc_5shot/requests.jsonl.gz
new file mode 100644
index 000000000..54f5eef04
Binary files /dev/null and b/olmo_data/oe_eval_tasks/arc_easy/test_mc_5shot/requests.jsonl.gz differ
diff --git a/olmo_data/oe_eval_tasks/arc_easy/test_rc_5shot/config.json b/olmo_data/oe_eval_tasks/arc_easy/test_rc_5shot/config.json
new file mode 100644
index 000000000..78b2bd973
--- /dev/null
+++ b/olmo_data/oe_eval_tasks/arc_easy/test_rc_5shot/config.json
@@ -0,0 +1 @@
+{"task_name": "arc_easy", "task_hash": "ccbbd993c851d3300140d81ffec0e397", "task_config": {"task_name": "arc_easy", "task_core": "arc_easy", "limit": 1000000, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": {"description": null}, "generation_kwargs": {}, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "id", "fewshot_source": "OLMES:ARC-Easy", "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"description": "ARC-Easy (RC) using OLMES-v0.1", "regimes": ["OLMES-v0.1"], "alias": "arc_easy:rc::olmes"}}, "current_date": "2024-11-18 21:50:27 UTC", "num_instances": 2376}
\ No newline at end of file
diff --git a/olmo_data/oe_eval_tasks/arc_easy/test_rc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/arc_easy/test_rc_5shot/requests.jsonl.gz
new file mode 100644
index 000000000..0cac53fc3
Binary files /dev/null and b/olmo_data/oe_eval_tasks/arc_easy/test_rc_5shot/requests.jsonl.gz differ
diff --git a/olmo_data/oe_eval_tasks/arc_easy/train_mc_5shot/config.json b/olmo_data/oe_eval_tasks/arc_easy/train_mc_5shot/config.json
new file mode 100644
index 000000000..c872294b3
--- /dev/null
+++ b/olmo_data/oe_eval_tasks/arc_easy/train_mc_5shot/config.json
@@ -0,0 +1 @@
+{"task_name": "arc_easy:mc", "task_hash": "afa7e96b485c4e4481b3b9b817faac36", "task_config": {"task_name": "arc_easy:mc", "task_core": "arc_easy", "limit": 1000000, "split": "train", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {}, "native_id_field": "id", "fewshot_source": "OLMES:ARC-Easy", "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"description": "ARC-Easy (MC) using OLMES-v0.1", "regimes": ["OLMES-v0.1"], "alias": "arc_easy:mc::olmes"}}, "current_date": "2024-11-18 22:06:16 UTC", "num_instances": 2251}
\ No newline at end of file
diff --git a/olmo_data/oe_eval_tasks/arc_easy/train_mc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/arc_easy/train_mc_5shot/requests.jsonl.gz
new file mode 100644
index 000000000..ef8734c88
Binary files /dev/null and b/olmo_data/oe_eval_tasks/arc_easy/train_mc_5shot/requests.jsonl.gz differ
diff --git a/olmo_data/oe_eval_tasks/arc_easy/train_rc_5shot/config.json b/olmo_data/oe_eval_tasks/arc_easy/train_rc_5shot/config.json
new file mode 100644
index 000000000..cf32334d1
--- /dev/null
+++ b/olmo_data/oe_eval_tasks/arc_easy/train_rc_5shot/config.json
@@ -0,0 +1 @@
+{"task_name": "arc_easy", "task_hash": "4a5241b308edb45d7b9eab594093c519", "task_config": {"task_name": "arc_easy", "task_core": "arc_easy", "limit": 1000000, "split": "train", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": {"description": null}, "generation_kwargs": {}, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "id", "fewshot_source": "OLMES:ARC-Easy", "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"description": "ARC-Easy (RC) using OLMES-v0.1", "regimes": ["OLMES-v0.1"], "alias": "arc_easy:rc::olmes"}}, "current_date": "2024-11-18 22:06:07 UTC", "num_instances": 2251}
\ No newline at end of file
diff --git a/olmo_data/oe_eval_tasks/arc_easy/train_rc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/arc_easy/train_rc_5shot/requests.jsonl.gz
new file mode 100644
index 000000000..e974da827
Binary files /dev/null and b/olmo_data/oe_eval_tasks/arc_easy/train_rc_5shot/requests.jsonl.gz differ
diff --git a/olmo_data/oe_eval_tasks/arc_easy/val_mc_5shot/config.json b/olmo_data/oe_eval_tasks/arc_easy/val_mc_5shot/config.json
new file mode 100644
index 000000000..6706d940a
--- /dev/null
+++ b/olmo_data/oe_eval_tasks/arc_easy/val_mc_5shot/config.json
@@ -0,0 +1 @@
+{"task_name": "arc_easy:mc", "task_hash": "443bd52f752399615d01c853a8d7386c", "task_config": {"task_name": "arc_easy:mc", "task_core": "arc_easy", "limit": 1000000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {}, "native_id_field": "id", "fewshot_source": "OLMES:ARC-Easy", "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"description": "ARC-Easy (MC) using OLMES-v0.1", "regimes": ["OLMES-v0.1"], "alias": "arc_easy:mc::olmes"}}, "current_date": "2024-11-18 22:06:24 UTC", "num_instances": 570}
\ No newline at end of file
diff --git a/olmo_data/oe_eval_tasks/arc_easy/val_mc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/arc_easy/val_mc_5shot/requests.jsonl.gz
new file mode 100644
index 000000000..60473206f
Binary files /dev/null and b/olmo_data/oe_eval_tasks/arc_easy/val_mc_5shot/requests.jsonl.gz differ
diff --git a/olmo_data/oe_eval_tasks/arc_easy/val_rc_5shot/config.json b/olmo_data/oe_eval_tasks/arc_easy/val_rc_5shot/config.json
new file mode 100644
index 000000000..ca3c39ec2
--- /dev/null
+++ b/olmo_data/oe_eval_tasks/arc_easy/val_rc_5shot/config.json
@@ -0,0 +1 @@
+{"task_name": "arc_easy", "task_hash": "0045e4f588a617cbe9ee5a4ae8ca1ce5", "task_config": {"task_name": "arc_easy", "task_core": "arc_easy", "limit": 1000000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": {"description": null}, "generation_kwargs": {}, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "id", "fewshot_source": "OLMES:ARC-Easy", "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"description": "ARC-Easy (RC) using OLMES-v0.1", "regimes": ["OLMES-v0.1"], "alias": "arc_easy:rc::olmes"}}, "current_date": "2024-11-18 21:45:15 UTC", "num_instances": 570}
\ No newline at end of file
diff --git a/olmo_data/oe_eval_tasks/arc_easy/val_rc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/arc_easy/val_rc_5shot/requests.jsonl.gz
new file mode 100644
index 000000000..4229a00e9
Binary files /dev/null and b/olmo_data/oe_eval_tasks/arc_easy/val_rc_5shot/requests.jsonl.gz differ
diff --git a/olmo_data/oe_eval_tasks/boolq/train_mc_5shot/config.json b/olmo_data/oe_eval_tasks/boolq/train_mc_5shot/config.json
new file mode 100644
index 000000000..37f015b42
--- /dev/null
+++ b/olmo_data/oe_eval_tasks/boolq/train_mc_5shot/config.json
@@ -0,0 +1 @@
+{"task_name": "boolq:mc", "task_hash": "a92ca849d7efd331110145eb71e4fc09", "task_config": {"task_name": "boolq:mc", "task_core": "boolq", "limit": 1000000, "split": "train", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {"uncond_docid_offset": null}, "native_id_field": "idx", "fewshot_source": "OLMES:BoolQ", "dataset_path": "super_glue", "dataset_name": "boolq", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "boolq:mc::olmes"}}, "current_date": "2024-11-18 22:06:52 UTC", "num_instances": 9427}
\ No newline at end of file
diff --git a/olmo_data/oe_eval_tasks/boolq/train_mc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/boolq/train_mc_5shot/requests.jsonl.gz
new file mode 100644
index 000000000..a63d7866a
Binary files /dev/null and b/olmo_data/oe_eval_tasks/boolq/train_mc_5shot/requests.jsonl.gz differ
diff --git a/olmo_data/oe_eval_tasks/boolq/train_rc_5shot/config.json b/olmo_data/oe_eval_tasks/boolq/train_rc_5shot/config.json
new file mode 100644
index 000000000..24e2ea699
--- /dev/null
+++ b/olmo_data/oe_eval_tasks/boolq/train_rc_5shot/config.json
@@ -0,0 +1 @@
+{"task_name": "boolq", "task_hash": "ec8729b372d310aaf3a222f37a7af7b9", "task_config": {"task_name": "boolq", "task_core": "boolq", "limit": 1000000, "split": "train", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {"uncond_docid_offset": null}, "native_id_field": "idx", "fewshot_source": "OLMES:BoolQ", "dataset_path": "super_glue", "dataset_name": "boolq", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "boolq:rc::olmes"}}, "current_date": "2024-11-18 22:06:41 UTC", "num_instances": 9427}
\ No newline at end of file
diff --git a/olmo_data/oe_eval_tasks/boolq/train_rc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/boolq/train_rc_5shot/requests.jsonl.gz
new file mode 100644
index 000000000..7459af202
Binary files /dev/null and b/olmo_data/oe_eval_tasks/boolq/train_rc_5shot/requests.jsonl.gz differ
diff --git a/olmo_data/oe_eval_tasks/boolq/val_mc_5shot/config.json b/olmo_data/oe_eval_tasks/boolq/val_mc_5shot/config.json
new file mode 100644
index 000000000..d50a38746
--- /dev/null
+++ b/olmo_data/oe_eval_tasks/boolq/val_mc_5shot/config.json
@@ -0,0 +1 @@
+{"task_name": "boolq:mc", "task_hash": "d88f45757f4a8c3802b7274857894a90", "task_config": {"task_name": "boolq:mc", "task_core": "boolq", "limit": 1000000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {"uncond_docid_offset": null}, "native_id_field": "idx", "fewshot_source": "OLMES:BoolQ", "dataset_path": "super_glue", "dataset_name": "boolq", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "boolq:mc::olmes"}}, "current_date": "2024-11-18 22:07:01 UTC", "num_instances": 3270}
\ No newline at end of file
diff --git a/olmo_data/oe_eval_tasks/boolq/val_mc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/boolq/val_mc_5shot/requests.jsonl.gz
new file mode 100644
index 000000000..8d349787b
Binary files /dev/null and b/olmo_data/oe_eval_tasks/boolq/val_mc_5shot/requests.jsonl.gz differ
diff --git a/olmo_data/oe_eval_tasks/boolq/val_rc_5shot/config.json b/olmo_data/oe_eval_tasks/boolq/val_rc_5shot/config.json
new file mode 100644
index 000000000..883e60e63
--- /dev/null
+++ b/olmo_data/oe_eval_tasks/boolq/val_rc_5shot/config.json
@@ -0,0 +1 @@
+{"task_name": "boolq", "task_hash": "8942f8464f48343f6ab2773d4a75d344", "task_config": {"task_name": "boolq", "task_core": "boolq", "limit": 1000000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {"uncond_docid_offset": null}, "native_id_field": "idx", "fewshot_source": "OLMES:BoolQ", "dataset_path": "super_glue", "dataset_name": "boolq", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "boolq:rc::olmes"}}, "current_date": "2024-11-18 21:36:50 UTC", "num_instances": 3270}
\ No newline at end of file
diff --git a/olmo_data/oe_eval_tasks/boolq/val_rc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/boolq/val_rc_5shot/requests.jsonl.gz
new file mode 100644
index 000000000..0d5ba6a18
Binary files /dev/null and b/olmo_data/oe_eval_tasks/boolq/val_rc_5shot/requests.jsonl.gz differ
diff --git a/olmo_data/oe_eval_tasks/csqa/train_mc_5shot/config.json b/olmo_data/oe_eval_tasks/csqa/train_mc_5shot/config.json
new file mode 100644
index 000000000..415db9c24
--- /dev/null
+++ b/olmo_data/oe_eval_tasks/csqa/train_mc_5shot/config.json
@@ -0,0 +1 @@
+{"task_name": "csqa:mc", "task_hash": "a47780e5c1faaccf3586bfc1e5cb020c", "task_config": {"task_name": "csqa:mc", "task_core": "csqa", "limit": 1000000, "split": "train", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {}, "native_id_field": "id", "fewshot_source": "OLMES:commonsense_qa", "dataset_path": "commonsense_qa", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "csqa:mc::olmes"}}, "current_date": "2024-11-18 22:07:22 UTC", "num_instances": 9741}
\ No newline at end of file
diff --git a/olmo_data/oe_eval_tasks/csqa/train_mc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/csqa/train_mc_5shot/requests.jsonl.gz
new file mode 100644
index 000000000..99bc9dadc
Binary files /dev/null and b/olmo_data/oe_eval_tasks/csqa/train_mc_5shot/requests.jsonl.gz differ
diff --git a/olmo_data/oe_eval_tasks/csqa/train_rc_5shot/config.json b/olmo_data/oe_eval_tasks/csqa/train_rc_5shot/config.json
new file mode 100644
index 000000000..29c65685e
--- /dev/null
+++ b/olmo_data/oe_eval_tasks/csqa/train_rc_5shot/config.json
@@ -0,0 +1 @@
+{"task_name": "csqa", "task_hash": "d859c48ca5bd25f69a8a64ad585b4447", "task_config": {"task_name": "csqa", "task_core": "csqa", "limit": 1000000, "split": "train", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "id", "fewshot_source": "OLMES:commonsense_qa", "dataset_path": "commonsense_qa", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "csqa:rc::olmes"}}, "current_date": "2024-11-18 22:07:11 UTC", "num_instances": 9741}
\ No newline at end of file
diff --git a/olmo_data/oe_eval_tasks/csqa/train_rc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/csqa/train_rc_5shot/requests.jsonl.gz
new file mode 100644
index 000000000..0660de98c
Binary files /dev/null and b/olmo_data/oe_eval_tasks/csqa/train_rc_5shot/requests.jsonl.gz differ
diff --git a/olmo_data/oe_eval_tasks/csqa/val_mc_5shot/config.json b/olmo_data/oe_eval_tasks/csqa/val_mc_5shot/config.json
new file mode 100644
index 000000000..cafadf56f
--- /dev/null
+++ b/olmo_data/oe_eval_tasks/csqa/val_mc_5shot/config.json
@@ -0,0 +1 @@
+{"task_name": "csqa:mc", "task_hash": "303a106ee47d83c3f9ae7e33bd993f0d", "task_config": {"task_name": "csqa:mc", "task_core": "csqa", "limit": 1000000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {}, "native_id_field": "id", "fewshot_source": "OLMES:commonsense_qa", "dataset_path": "commonsense_qa", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "csqa:mc::olmes"}}, "current_date": "2024-11-18 22:07:31 UTC", "num_instances": 1221}
\ No newline at end of file
diff --git a/olmo_data/oe_eval_tasks/csqa/val_mc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/csqa/val_mc_5shot/requests.jsonl.gz
new file mode 100644
index 000000000..eabd6c653
Binary files /dev/null and b/olmo_data/oe_eval_tasks/csqa/val_mc_5shot/requests.jsonl.gz differ
diff --git a/olmo_data/oe_eval_tasks/csqa/val_rc_5shot/config.json b/olmo_data/oe_eval_tasks/csqa/val_rc_5shot/config.json
new file mode 100644
index 000000000..20387ae6f
--- /dev/null
+++ b/olmo_data/oe_eval_tasks/csqa/val_rc_5shot/config.json
@@ -0,0 +1 @@
+{"task_name": "csqa", "task_hash": "6246a67a1b7a81aaa134aaae4480f1b9", "task_config": {"task_name": "csqa", "task_core": "csqa", "limit": 1000000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "id", "fewshot_source": "OLMES:commonsense_qa", "dataset_path": "commonsense_qa", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "csqa:rc::olmes"}}, "current_date": "2024-11-18 21:37:21 UTC", "num_instances": 1221}
\ No newline at end of file
diff --git a/olmo_data/oe_eval_tasks/csqa/val_rc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/csqa/val_rc_5shot/requests.jsonl.gz
new file mode 100644
index 000000000..cd1d32af7
Binary files /dev/null and b/olmo_data/oe_eval_tasks/csqa/val_rc_5shot/requests.jsonl.gz differ
diff --git a/olmo_data/oe_eval_tasks/hellaswag/train_mc_5shot/config.json b/olmo_data/oe_eval_tasks/hellaswag/train_mc_5shot/config.json
new file mode 100644
index 000000000..7ee9e27b8
--- /dev/null
+++ b/olmo_data/oe_eval_tasks/hellaswag/train_mc_5shot/config.json
@@ -0,0 +1 @@
+{"task_name": "hellaswag:mc", "task_hash": "800769140f7d2dbfebbdcb6cabc30f0a", "task_config": {"task_name": "hellaswag:mc", "task_core": "hellaswag", "limit": 1000000, "split": "train", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {}, "native_id_field": "ind", "fewshot_source": "OLMES:hellaswag", "dataset_path": "hellaswag", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "hellaswag:mc::olmes"}}, "current_date": "2024-11-18 22:08:27 UTC", "num_instances": 39905}
\ No newline at end of file
diff --git a/olmo_data/oe_eval_tasks/hellaswag/train_mc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/hellaswag/train_mc_5shot/requests.jsonl.gz
new file mode 100644
index 000000000..af0dce9d0
Binary files /dev/null and b/olmo_data/oe_eval_tasks/hellaswag/train_mc_5shot/requests.jsonl.gz differ
diff --git a/olmo_data/oe_eval_tasks/hellaswag/train_rc_5shot/config.json b/olmo_data/oe_eval_tasks/hellaswag/train_rc_5shot/config.json
new file mode 100644
index 000000000..39c016991
--- /dev/null
+++ b/olmo_data/oe_eval_tasks/hellaswag/train_rc_5shot/config.json
@@ -0,0 +1 @@
+{"task_name": "hellaswag", "task_hash": "25aa48c8c349d06cf85e1d9667d1c63f", "task_config": {"task_name": "hellaswag", "task_core": "hellaswag", "limit": 1000000, "split": "train", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "ind", "fewshot_source": "OLMES:hellaswag", "dataset_path": "hellaswag", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "hellaswag:rc::olmes"}}, "current_date": "2024-11-18 22:07:53 UTC", "num_instances": 39905}
\ No newline at end of file
diff --git a/olmo_data/oe_eval_tasks/hellaswag/train_rc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/hellaswag/train_rc_5shot/requests.jsonl.gz
new file mode 100644
index 000000000..82d150785
Binary files /dev/null and b/olmo_data/oe_eval_tasks/hellaswag/train_rc_5shot/requests.jsonl.gz differ
diff --git a/olmo_data/oe_eval_tasks/hellaswag/val_mc_5shot/config.json b/olmo_data/oe_eval_tasks/hellaswag/val_mc_5shot/config.json
new file mode 100644
index 000000000..901a0ca53
--- /dev/null
+++ b/olmo_data/oe_eval_tasks/hellaswag/val_mc_5shot/config.json
@@ -0,0 +1 @@
+{"task_name": "hellaswag:mc", "task_hash": "8c78527c6c17a8765b7a0b53353e7266", "task_config": {"task_name": "hellaswag:mc", "task_core": "hellaswag", "limit": 1000000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {}, "native_id_field": "ind", "fewshot_source": "OLMES:hellaswag", "dataset_path": "hellaswag", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "hellaswag:mc::olmes"}}, "current_date": "2024-11-18 22:08:50 UTC", "num_instances": 10042}
\ No newline at end of file
diff --git a/olmo_data/oe_eval_tasks/hellaswag/val_mc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/hellaswag/val_mc_5shot/requests.jsonl.gz
new file mode 100644
index 000000000..e7bf8cba4
Binary files /dev/null and b/olmo_data/oe_eval_tasks/hellaswag/val_mc_5shot/requests.jsonl.gz differ
diff --git a/olmo_data/oe_eval_tasks/hellaswag/val_rc_5shot/config.json b/olmo_data/oe_eval_tasks/hellaswag/val_rc_5shot/config.json
new file mode 100644
index 000000000..6913698e6
--- /dev/null
+++ b/olmo_data/oe_eval_tasks/hellaswag/val_rc_5shot/config.json
@@ -0,0 +1 @@
+{"task_name": "hellaswag", "task_hash": "e735ce12b24a16e9d583e6ee5bfe720f", "task_config": {"task_name": "hellaswag", "task_core": "hellaswag", "limit": 1000000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "ind", "fewshot_source": "OLMES:hellaswag", "dataset_path": "hellaswag", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "hellaswag:rc::olmes"}}, "current_date": "2024-11-18 21:30:24 UTC", "num_instances": 10042}
\ No newline at end of file
diff --git a/olmo_data/oe_eval_tasks/hellaswag/val_rc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/hellaswag/val_rc_5shot/requests.jsonl.gz
new file mode 100644
index 000000000..dff9599d9
Binary files /dev/null and b/olmo_data/oe_eval_tasks/hellaswag/val_rc_5shot/requests.jsonl.gz differ
diff --git a/olmo_data/oe_eval_tasks/openbookqa/test_mc_5shot/config.json b/olmo_data/oe_eval_tasks/openbookqa/test_mc_5shot/config.json
new file mode 100644
index 000000000..57f770225
--- /dev/null
+++ b/olmo_data/oe_eval_tasks/openbookqa/test_mc_5shot/config.json
@@ -0,0 +1 @@
+{"task_name": "openbookqa:mc", "task_hash": "c29899952d034341728ff084a971203b", "task_config": {"task_name": "openbookqa:mc", "task_core": "openbookqa", "limit": 1000000, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {}, "native_id_field": "id", "fewshot_source": "OLMES:openbookqa", "dataset_path": "openbookqa", "dataset_name": "main", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "openbookqa:mc::olmes"}}, "current_date": "2024-11-18 22:09:27 UTC", "num_instances": 500}
\ No newline at end of file
diff --git a/olmo_data/oe_eval_tasks/openbookqa/test_mc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/openbookqa/test_mc_5shot/requests.jsonl.gz
new file mode 100644
index 000000000..7204d4ebb
Binary files /dev/null and b/olmo_data/oe_eval_tasks/openbookqa/test_mc_5shot/requests.jsonl.gz differ
diff --git a/olmo_data/oe_eval_tasks/openbookqa/test_rc_5shot/config.json b/olmo_data/oe_eval_tasks/openbookqa/test_rc_5shot/config.json
new file mode 100644
index 000000000..c9ea826f0
--- /dev/null
+++ b/olmo_data/oe_eval_tasks/openbookqa/test_rc_5shot/config.json
@@ -0,0 +1 @@
+{"task_name": "openbookqa", "task_hash": "85531fd2bc307ef4b3b7c8169838815e", "task_config": {"task_name": "openbookqa", "task_core": "openbookqa", "limit": 1000000, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": {"no_prefix": false}, "generation_kwargs": {}, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "id", "fewshot_source": "OLMES:openbookqa", "dataset_path": "openbookqa", "dataset_name": "main", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "openbookqa:rc::olmes"}}, "current_date": "2024-11-18 21:50:36 UTC", "num_instances": 500}
\ No newline at end of file
diff --git a/olmo_data/oe_eval_tasks/openbookqa/test_rc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/openbookqa/test_rc_5shot/requests.jsonl.gz
new file mode 100644
index 000000000..7f1e1d44e
Binary files /dev/null and b/olmo_data/oe_eval_tasks/openbookqa/test_rc_5shot/requests.jsonl.gz differ
diff --git a/olmo_data/oe_eval_tasks/openbookqa/train_mc_5shot/config.json b/olmo_data/oe_eval_tasks/openbookqa/train_mc_5shot/config.json
new file mode 100644
index 000000000..d46beb628
--- /dev/null
+++ b/olmo_data/oe_eval_tasks/openbookqa/train_mc_5shot/config.json
@@ -0,0 +1 @@
+{"task_name": "openbookqa:mc", "task_hash": "dd163decb9d61b713173d24932ba4a8f", "task_config": {"task_name": "openbookqa:mc", "task_core": "openbookqa", "limit": 1000000, "split": "train", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {}, "native_id_field": "id", "fewshot_source": "OLMES:openbookqa", "dataset_path": "openbookqa", "dataset_name": "main", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "openbookqa:mc::olmes"}}, "current_date": "2024-11-18 22:09:11 UTC", "num_instances": 4957}
\ No newline at end of file
diff --git a/olmo_data/oe_eval_tasks/openbookqa/train_mc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/openbookqa/train_mc_5shot/requests.jsonl.gz
new file mode 100644
index 000000000..40f4896f1
Binary files /dev/null and b/olmo_data/oe_eval_tasks/openbookqa/train_mc_5shot/requests.jsonl.gz differ
diff --git a/olmo_data/oe_eval_tasks/openbookqa/train_rc_5shot/config.json b/olmo_data/oe_eval_tasks/openbookqa/train_rc_5shot/config.json
new file mode 100644
index 000000000..0cb8c61c9
--- /dev/null
+++ b/olmo_data/oe_eval_tasks/openbookqa/train_rc_5shot/config.json
@@ -0,0 +1 @@
+{"task_name": "openbookqa", "task_hash": "7cad17f1f91900d30ccdc83be08e58ab", "task_config": {"task_name": "openbookqa", "task_core": "openbookqa", "limit": 1000000, "split": "train", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": {"no_prefix": false}, "generation_kwargs": {}, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "id", "fewshot_source": "OLMES:openbookqa", "dataset_path": "openbookqa", "dataset_name": "main", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "openbookqa:rc::olmes"}}, "current_date": "2024-11-18 22:09:02 UTC", "num_instances": 4957}
\ No newline at end of file
diff --git a/olmo_data/oe_eval_tasks/openbookqa/train_rc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/openbookqa/train_rc_5shot/requests.jsonl.gz
new file mode 100644
index 000000000..9103f06b0
Binary files /dev/null and b/olmo_data/oe_eval_tasks/openbookqa/train_rc_5shot/requests.jsonl.gz differ
diff --git a/olmo_data/oe_eval_tasks/openbookqa/val_mc_5shot/config.json b/olmo_data/oe_eval_tasks/openbookqa/val_mc_5shot/config.json
new file mode 100644
index 000000000..9cbaf5768
--- /dev/null
+++ b/olmo_data/oe_eval_tasks/openbookqa/val_mc_5shot/config.json
@@ -0,0 +1 @@
+{"task_name": "openbookqa:mc", "task_hash": "e48b50ed535a645bd629f1dbe0366716", "task_config": {"task_name": "openbookqa:mc", "task_core": "openbookqa", "limit": 1000000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {}, "native_id_field": "id", "fewshot_source": "OLMES:openbookqa", "dataset_path": "openbookqa", "dataset_name": "main", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "openbookqa:mc::olmes"}}, "current_date": "2024-11-18 22:09:19 UTC", "num_instances": 500}
\ No newline at end of file
diff --git a/olmo_data/oe_eval_tasks/openbookqa/val_mc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/openbookqa/val_mc_5shot/requests.jsonl.gz
new file mode 100644
index 000000000..aa4fbda4b
Binary files /dev/null and b/olmo_data/oe_eval_tasks/openbookqa/val_mc_5shot/requests.jsonl.gz differ
diff --git a/olmo_data/oe_eval_tasks/openbookqa/val_rc_5shot/config.json b/olmo_data/oe_eval_tasks/openbookqa/val_rc_5shot/config.json
new file mode 100644
index 000000000..423f9aeda
--- /dev/null
+++ b/olmo_data/oe_eval_tasks/openbookqa/val_rc_5shot/config.json
@@ -0,0 +1 @@
+{"task_name": "openbookqa", "task_hash": "abaed82ecf27bf6455dbef7b410fffda", "task_config": {"task_name": "openbookqa", "task_core": "openbookqa", "limit": 1000000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": {"no_prefix": false}, "generation_kwargs": {}, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "id", "fewshot_source": "OLMES:openbookqa", "dataset_path": "openbookqa", "dataset_name": "main", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "openbookqa:rc::olmes"}}, "current_date": "2024-11-18 21:41:51 UTC", "num_instances": 500}
\ No newline at end of file
diff --git a/olmo_data/oe_eval_tasks/openbookqa/val_rc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/openbookqa/val_rc_5shot/requests.jsonl.gz
new file mode 100644
index 000000000..d8da65bf3
Binary files /dev/null and b/olmo_data/oe_eval_tasks/openbookqa/val_rc_5shot/requests.jsonl.gz differ
diff --git a/olmo_data/oe_eval_tasks/piqa/train_mc_5shot/config.json b/olmo_data/oe_eval_tasks/piqa/train_mc_5shot/config.json
new file mode 100644
index 000000000..ebc1074a9
--- /dev/null
+++ b/olmo_data/oe_eval_tasks/piqa/train_mc_5shot/config.json
@@ -0,0 +1 @@
+{"task_name": "piqa:mc", "task_hash": "72a36e74abdbc603222dceff6a736141", "task_config": {"task_name": "piqa:mc", "task_core": "piqa", "limit": 1000000, "split": "train", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {}, "native_id_field": "index", "fewshot_source": "OLMES:piqa", "dataset_path": "piqa", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "piqa:mc::olmes"}}, "current_date": "2024-11-18 22:09:50 UTC", "num_instances": 16113}
\ No newline at end of file
diff --git a/olmo_data/oe_eval_tasks/piqa/train_mc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/piqa/train_mc_5shot/requests.jsonl.gz
new file mode 100644
index 000000000..b51e19d99
Binary files /dev/null and b/olmo_data/oe_eval_tasks/piqa/train_mc_5shot/requests.jsonl.gz differ
diff --git a/olmo_data/oe_eval_tasks/piqa/train_rc_5shot/config.json b/olmo_data/oe_eval_tasks/piqa/train_rc_5shot/config.json
new file mode 100644
index 000000000..fe1e12b9a
--- /dev/null
+++ b/olmo_data/oe_eval_tasks/piqa/train_rc_5shot/config.json
@@ -0,0 +1 @@
+{"task_name": "piqa", "task_hash": "88d4acbf580dfde078ed155408942b4f", "task_config": {"task_name": "piqa", "task_core": "piqa", "limit": 1000000, "split": "train", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": "OLMES:piqa", "dataset_path": "piqa", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "piqa:rc::olmes"}}, "current_date": "2024-11-18 22:09:38 UTC", "num_instances": 16113}
\ No newline at end of file
diff --git a/olmo_data/oe_eval_tasks/piqa/train_rc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/piqa/train_rc_5shot/requests.jsonl.gz
new file mode 100644
index 000000000..d04a2a737
Binary files /dev/null and b/olmo_data/oe_eval_tasks/piqa/train_rc_5shot/requests.jsonl.gz differ
diff --git a/olmo_data/oe_eval_tasks/piqa/val_mc_5shot/config.json b/olmo_data/oe_eval_tasks/piqa/val_mc_5shot/config.json
new file mode 100644
index 000000000..8b6eb6988
--- /dev/null
+++ b/olmo_data/oe_eval_tasks/piqa/val_mc_5shot/config.json
@@ -0,0 +1 @@
+{"task_name": "piqa:mc", "task_hash": "3692c0fd172e26cd97e4a75b14cf257e", "task_config": {"task_name": "piqa:mc", "task_core": "piqa", "limit": 1000000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {}, "native_id_field": "index", "fewshot_source": "OLMES:piqa", "dataset_path": "piqa", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "piqa:mc::olmes"}}, "current_date": "2024-11-18 22:10:00 UTC", "num_instances": 1838}
\ No newline at end of file
diff --git a/olmo_data/oe_eval_tasks/piqa/val_mc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/piqa/val_mc_5shot/requests.jsonl.gz
new file mode 100644
index 000000000..c04afea2c
Binary files /dev/null and b/olmo_data/oe_eval_tasks/piqa/val_mc_5shot/requests.jsonl.gz differ
diff --git a/olmo_data/oe_eval_tasks/piqa/val_rc_5shot/config.json b/olmo_data/oe_eval_tasks/piqa/val_rc_5shot/config.json
new file mode 100644
index 000000000..829c52d2a
--- /dev/null
+++ b/olmo_data/oe_eval_tasks/piqa/val_rc_5shot/config.json
@@ -0,0 +1 @@
+{"task_name": "piqa", "task_hash": "ca86822d78062d11a8a4a70b455d70bf", "task_config": {"task_name": "piqa", "task_core": "piqa", "limit": 1000000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": "OLMES:piqa", "dataset_path": "piqa", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "piqa:rc::olmes"}}, "current_date": "2024-11-18 21:42:25 UTC", "num_instances": 1838}
\ No newline at end of file
diff --git a/olmo_data/oe_eval_tasks/piqa/val_rc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/piqa/val_rc_5shot/requests.jsonl.gz
new file mode 100644
index 000000000..3e330668e
Binary files /dev/null and b/olmo_data/oe_eval_tasks/piqa/val_rc_5shot/requests.jsonl.gz differ
diff --git a/olmo_data/oe_eval_tasks/socialiqa/train_mc_5shot/config.json b/olmo_data/oe_eval_tasks/socialiqa/train_mc_5shot/config.json
new file mode 100644
index 000000000..36804a2f1
--- /dev/null
+++ b/olmo_data/oe_eval_tasks/socialiqa/train_mc_5shot/config.json
@@ -0,0 +1 @@
+{"task_name": "socialiqa:mc", "task_hash": "34126a4338f12048610e837d10c3ce03", "task_config": {"task_name": "socialiqa:mc", "task_core": "socialiqa", "limit": 1000000, "split": "train", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {}, "native_id_field": "index", "fewshot_source": "OLMES:social_i_qa", "dataset_path": "social_i_qa", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "socialiqa:mc::olmes"}}, "current_date": "2024-11-18 22:10:34 UTC", "num_instances": 33410}
\ No newline at end of file
diff --git a/olmo_data/oe_eval_tasks/socialiqa/train_mc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/socialiqa/train_mc_5shot/requests.jsonl.gz
new file mode 100644
index 000000000..23ebb249b
Binary files /dev/null and b/olmo_data/oe_eval_tasks/socialiqa/train_mc_5shot/requests.jsonl.gz differ
diff --git a/olmo_data/oe_eval_tasks/socialiqa/train_rc_5shot/config.json b/olmo_data/oe_eval_tasks/socialiqa/train_rc_5shot/config.json
new file mode 100644
index 000000000..73cf0ebf6
--- /dev/null
+++ b/olmo_data/oe_eval_tasks/socialiqa/train_rc_5shot/config.json
@@ -0,0 +1 @@
+{"task_name": "socialiqa", "task_hash": "8b5e1d3d948d7a8e8b58939cd9948504", "task_config": {"task_name": "socialiqa", "task_core": "socialiqa", "limit": 1000000, "split": "train", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": "OLMES:social_i_qa", "dataset_path": "social_i_qa", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "socialiqa:rc::olmes"}}, "current_date": "2024-11-18 22:10:14 UTC", "num_instances": 33410}
\ No newline at end of file
diff --git a/olmo_data/oe_eval_tasks/socialiqa/train_rc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/socialiqa/train_rc_5shot/requests.jsonl.gz
new file mode 100644
index 000000000..d71d37e10
Binary files /dev/null and b/olmo_data/oe_eval_tasks/socialiqa/train_rc_5shot/requests.jsonl.gz differ
diff --git a/olmo_data/oe_eval_tasks/socialiqa/val_mc_5shot/config.json b/olmo_data/oe_eval_tasks/socialiqa/val_mc_5shot/config.json
new file mode 100644
index 000000000..b3a95ad38
--- /dev/null
+++ b/olmo_data/oe_eval_tasks/socialiqa/val_mc_5shot/config.json
@@ -0,0 +1 @@
+{"task_name": "socialiqa:mc", "task_hash": "47bf4f8936248fcad679637bf32a121d", "task_config": {"task_name": "socialiqa:mc", "task_core": "socialiqa", "limit": 1000000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {}, "native_id_field": "index", "fewshot_source": "OLMES:social_i_qa", "dataset_path": "social_i_qa", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "socialiqa:mc::olmes"}}, "current_date": "2024-11-18 22:10:46 UTC", "num_instances": 1954}
\ No newline at end of file
diff --git a/olmo_data/oe_eval_tasks/socialiqa/val_mc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/socialiqa/val_mc_5shot/requests.jsonl.gz
new file mode 100644
index 000000000..a940f9dbf
Binary files /dev/null and b/olmo_data/oe_eval_tasks/socialiqa/val_mc_5shot/requests.jsonl.gz differ
diff --git a/olmo_data/oe_eval_tasks/socialiqa/val_rc_5shot/config.json b/olmo_data/oe_eval_tasks/socialiqa/val_rc_5shot/config.json
new file mode 100644
index 000000000..0a9c89675
--- /dev/null
+++ b/olmo_data/oe_eval_tasks/socialiqa/val_rc_5shot/config.json
@@ -0,0 +1 @@
+{"task_name": "socialiqa", "task_hash": "5d584a138e2c0203b49ed1a0bb68b2b6", "task_config": {"task_name": "socialiqa", "task_core": "socialiqa", "limit": 1000000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": "OLMES:social_i_qa", "dataset_path": "social_i_qa", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "socialiqa:rc::olmes"}}, "current_date": "2024-11-18 21:42:41 UTC", "num_instances": 1954}
\ No newline at end of file
diff --git a/olmo_data/oe_eval_tasks/socialiqa/val_rc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/socialiqa/val_rc_5shot/requests.jsonl.gz
new file mode 100644
index 000000000..5cc9671f0
Binary files /dev/null and b/olmo_data/oe_eval_tasks/socialiqa/val_rc_5shot/requests.jsonl.gz differ
diff --git a/olmo_data/oe_eval_tasks/winogrande/train_mc_5shot/config.json b/olmo_data/oe_eval_tasks/winogrande/train_mc_5shot/config.json
new file mode 100644
index 000000000..48d92c25d
--- /dev/null
+++ b/olmo_data/oe_eval_tasks/winogrande/train_mc_5shot/config.json
@@ -0,0 +1 @@
+{"task_name": "winogrande:mc", "task_hash": "e056a4b5a7e921b8cda427d548e4352c", "task_config": {"task_name": "winogrande:mc", "task_core": "winogrande", "limit": 1000000, "split": "train", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {}, "native_id_field": "index", "fewshot_source": "OLMES:winogrande", "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "winogrande:mc::olmes"}}, "current_date": "2024-11-18 22:11:12 UTC", "num_instances": 40398}
\ No newline at end of file
diff --git a/olmo_data/oe_eval_tasks/winogrande/train_mc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/winogrande/train_mc_5shot/requests.jsonl.gz
new file mode 100644
index 000000000..2ec442b9c
Binary files /dev/null and b/olmo_data/oe_eval_tasks/winogrande/train_mc_5shot/requests.jsonl.gz differ
diff --git a/olmo_data/oe_eval_tasks/winogrande/train_rc_5shot/config.json b/olmo_data/oe_eval_tasks/winogrande/train_rc_5shot/config.json
new file mode 100644
index 000000000..06b3f3197
--- /dev/null
+++ b/olmo_data/oe_eval_tasks/winogrande/train_rc_5shot/config.json
@@ -0,0 +1 @@
+{"task_name": "winogrande", "task_hash": "50dbaebd49327d22ae8f25be8bda8ed5", "task_config": {"task_name": "winogrande", "task_core": "winogrande", "limit": 1000000, "split": "train", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {}, "native_id_field": null, "fewshot_source": "OLMES:winogrande", "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "winogrande:rc::olmes"}}, "current_date": "2024-11-18 22:10:57 UTC", "num_instances": 40398}
\ No newline at end of file
diff --git a/olmo_data/oe_eval_tasks/winogrande/train_rc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/winogrande/train_rc_5shot/requests.jsonl.gz
new file mode 100644
index 000000000..52bc59f5d
Binary files /dev/null and b/olmo_data/oe_eval_tasks/winogrande/train_rc_5shot/requests.jsonl.gz differ
diff --git a/olmo_data/oe_eval_tasks/winogrande/val_mc_5shot/config.json b/olmo_data/oe_eval_tasks/winogrande/val_mc_5shot/config.json
new file mode 100644
index 000000000..255d33433
--- /dev/null
+++ b/olmo_data/oe_eval_tasks/winogrande/val_mc_5shot/config.json
@@ -0,0 +1 @@
+{"task_name": "winogrande:mc", "task_hash": "681e052d61dceaa873326356fbebe83d", "task_config": {"task_name": "winogrande:mc", "task_core": "winogrande", "limit": 1000000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {}, "native_id_field": "index", "fewshot_source": "OLMES:winogrande", "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "winogrande:mc::olmes"}}, "current_date": "2024-11-18 22:11:22 UTC", "num_instances": 1267}
\ No newline at end of file
diff --git a/olmo_data/oe_eval_tasks/winogrande/val_mc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/winogrande/val_mc_5shot/requests.jsonl.gz
new file mode 100644
index 000000000..b81f1cf39
Binary files /dev/null and b/olmo_data/oe_eval_tasks/winogrande/val_mc_5shot/requests.jsonl.gz differ
diff --git a/olmo_data/oe_eval_tasks/winogrande/val_rc_5shot/config.json b/olmo_data/oe_eval_tasks/winogrande/val_rc_5shot/config.json
new file mode 100644
index 000000000..6a32cf380
--- /dev/null
+++ b/olmo_data/oe_eval_tasks/winogrande/val_rc_5shot/config.json
@@ -0,0 +1 @@
+{"task_name": "winogrande", "task_hash": "ef12b7df6d713ce11f8e8ff69dbe4b22", "task_config": {"task_name": "winogrande", "task_core": "winogrande", "limit": 1000000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {}, "native_id_field": null, "fewshot_source": "OLMES:winogrande", "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "winogrande:rc::olmes"}}, "current_date": "2024-11-18 21:43:43 UTC", "num_instances": 1267}
\ No newline at end of file
diff --git a/olmo_data/oe_eval_tasks/winogrande/val_rc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/winogrande/val_rc_5shot/requests.jsonl.gz
new file mode 100644
index 000000000..20473318b
Binary files /dev/null and b/olmo_data/oe_eval_tasks/winogrande/val_rc_5shot/requests.jsonl.gz differ