diff --git a/olmo/eval/downstream.py b/olmo/eval/downstream.py index 2a9d1d365..a53c875c1 100644 --- a/olmo/eval/downstream.py +++ b/olmo/eval/downstream.py @@ -1863,3 +1863,502 @@ def doc_to_label(self, doc) -> int: {"dataset_path": "winogrande", "dataset_name": "rc_5shot", "metric_type": "bpb"}, ), } + +# This standardizes the metrics we should eval for the ladder. +# Train and test sets are added when applicable. +label_to_task_map_new = { + "arc_challenge_train_rc_5shot": ( + OEEvalTask, + {"dataset_path": "arc_challenge", "dataset_name": "train_rc_5shot", "metric_type": "len_norm"}, + ), + "arc_challenge_train_rc_5shot_bpb": ( + OEEvalTask, + {"dataset_path": "arc_challenge", "dataset_name": "train_rc_5shot", "metric_type": "bpb"}, + ), + "arc_challenge_train_mc_5shot": ( + OEEvalTask, + {"dataset_path": "arc_challenge", "dataset_name": "train_mc_5shot", "metric_type": "acc"}, + ), + "arc_challenge_train_mc_5shot_bpb": ( + OEEvalTask, + {"dataset_path": "arc_challenge", "dataset_name": "train_mc_5shot", "metric_type": "bpb"}, + ), + "arc_challenge_val_rc_5shot": ( + OEEvalTask, + {"dataset_path": "arc_challenge", "dataset_name": "val_rc_5shot", "metric_type": "len_norm"}, + ), + "arc_challenge_val_rc_5shot_bpb": ( + OEEvalTask, + {"dataset_path": "arc_challenge", "dataset_name": "val_rc_5shot", "metric_type": "bpb"}, + ), + "arc_challenge_val_mc_5shot": ( + OEEvalTask, + {"dataset_path": "arc_challenge", "dataset_name": "val_mc_5shot", "metric_type": "acc"}, + ), + "arc_challenge_val_mc_5shot_bpb": ( + OEEvalTask, + {"dataset_path": "arc_challenge", "dataset_name": "val_mc_5shot", "metric_type": "bpb"}, + ), + "arc_challenge_test_rc_5shot": ( + OEEvalTask, + {"dataset_path": "arc_challenge", "dataset_name": "test_rc_5shot", "metric_type": "len_norm"}, + ), + "arc_challenge_test_rc_5shot_bpb": ( + OEEvalTask, + {"dataset_path": "arc_challenge", "dataset_name": "test_rc_5shot", "metric_type": "bpb"}, + ), + "arc_challenge_test_mc_5shot": ( + OEEvalTask, + {"dataset_path": "arc_challenge", "dataset_name": "test_mc_5shot", "metric_type": "acc"}, + ), + "arc_challenge_test_mc_5shot_bpb": ( + OEEvalTask, + {"dataset_path": "arc_challenge", "dataset_name": "test_mc_5shot", "metric_type": "bpb"}, + ), + "arc_easy_train_rc_5shot": ( + OEEvalTask, + {"dataset_path": "arc_easy", "dataset_name": "train_rc_5shot", "metric_type": "len_norm"}, + ), # this used to be acc + "arc_easy_train_rc_5shot_bpb": ( + OEEvalTask, + {"dataset_path": "arc_easy", "dataset_name": "train_rc_5shot", "metric_type": "bpb"}, + ), + "arc_easy_train_mc_5shot": ( + OEEvalTask, + {"dataset_path": "arc_easy", "dataset_name": "train_mc_5shot", "metric_type": "acc"}, + ), + "arc_easy_train_mc_5shot_bpb": ( + OEEvalTask, + {"dataset_path": "arc_easy", "dataset_name": "train_mc_5shot", "metric_type": "bpb"}, + ), + "arc_easy_val_rc_5shot": ( + OEEvalTask, + {"dataset_path": "arc_easy", "dataset_name": "val_rc_5shot", "metric_type": "len_norm"}, + ), + "arc_easy_val_rc_5shot_bpb": ( + OEEvalTask, + {"dataset_path": "arc_easy", "dataset_name": "val_rc_5shot", "metric_type": "bpb"}, + ), + "arc_easy_val_mc_5shot": ( + OEEvalTask, + {"dataset_path": "arc_easy", "dataset_name": "val_mc_5shot", "metric_type": "acc"}, + ), + "arc_easy_val_mc_5shot_bpb": ( + OEEvalTask, + {"dataset_path": "arc_easy", "dataset_name": "val_mc_5shot", "metric_type": "bpb"}, + ), + "arc_easy_test_rc_5shot": ( + OEEvalTask, + {"dataset_path": "arc_easy", "dataset_name": "test_rc_5shot", "metric_type": "len_norm"}, + ), + "arc_easy_test_rc_5shot_bpb": ( + OEEvalTask, + {"dataset_path": "arc_easy", "dataset_name": "test_rc_5shot", "metric_type": "bpb"}, + ), + "arc_easy_test_mc_5shot": ( + OEEvalTask, + {"dataset_path": "arc_easy", "dataset_name": "test_mc_5shot", "metric_type": "acc"}, + ), + "arc_easy_test_mc_5shot_bpb": ( + OEEvalTask, + {"dataset_path": "arc_easy", "dataset_name": "test_mc_5shot", "metric_type": "bpb"}, + ), + "boolq_train_rc_5shot": ( + OEEvalTask, + {"dataset_path": "boolq", "dataset_name": "train_rc_5shot", "metric_type": "len_norm"}, + ), # this used to be acc + "boolq_train_rc_5shot_bpb": ( + OEEvalTask, + {"dataset_path": "boolq", "dataset_name": "train_rc_5shot", "metric_type": "bpb"}, + ), + "boolq_train_mc_5shot": ( + OEEvalTask, + {"dataset_path": "boolq", "dataset_name": "train_mc_5shot", "metric_type": "acc"}, + ), + "boolq_train_mc_5shot_bpb": ( + OEEvalTask, + {"dataset_path": "boolq", "dataset_name": "train_mc_5shot", "metric_type": "bpb"}, + ), + "boolq_val_rc_5shot": ( + OEEvalTask, + {"dataset_path": "boolq", "dataset_name": "val_rc_5shot", "metric_type": "len_norm"}, + ), + "boolq_val_rc_5shot_bpb": ( + OEEvalTask, + {"dataset_path": "boolq", "dataset_name": "val_rc_5shot", "metric_type": "bpb"}, + ), + "boolq_val_mc_5shot": ( + OEEvalTask, + {"dataset_path": "boolq", "dataset_name": "val_mc_5shot", "metric_type": "acc"}, + ), + "boolq_val_mc_5shot_bpb": ( + OEEvalTask, + {"dataset_path": "boolq", "dataset_name": "val_mc_5shot", "metric_type": "bpb"}, + ), + "csqa_train_rc_5shot": ( + OEEvalTask, + {"dataset_path": "csqa", "dataset_name": "train_rc_5shot", "metric_type": "len_norm"}, + ), + "csqa_train_rc_5shot_bpb": ( + OEEvalTask, + {"dataset_path": "csqa", "dataset_name": "train_rc_5shot", "metric_type": "bpb"}, + ), + "csqa_train_mc_5shot": ( + OEEvalTask, + {"dataset_path": "csqa", "dataset_name": "train_mc_5shot", "metric_type": "acc"}, + ), + "csqa_train_mc_5shot_bpb": ( + OEEvalTask, + {"dataset_path": "csqa", "dataset_name": "train_mc_5shot", "metric_type": "bpb"}, + ), + "csqa_val_rc_5shot": ( + OEEvalTask, + {"dataset_path": "csqa", "dataset_name": "val_rc_5shot", "metric_type": "len_norm"}, + ), + "csqa_val_rc_5shot_bpb": ( + OEEvalTask, + {"dataset_path": "csqa", "dataset_name": "val_rc_5shot", "metric_type": "bpb"}, + ), + "csqa_val_mc_5shot": ( + OEEvalTask, + {"dataset_path": "csqa", "dataset_name": "val_mc_5shot", "metric_type": "acc"}, + ), + "csqa_val_mc_5shot_bpb": ( + OEEvalTask, + {"dataset_path": "csqa", "dataset_name": "val_mc_5shot", "metric_type": "bpb"}, + ), + "hellaswag_train_rc_5shot": ( + OEEvalTask, + {"dataset_path": "hellaswag", "dataset_name": "train_rc_5shot", "metric_type": "len_norm"}, + ), + "hellaswag_train_rc_5shot_bpb": ( + OEEvalTask, + {"dataset_path": "hellaswag", "dataset_name": "train_rc_5shot", "metric_type": "bpb"}, + ), + "hellaswag_train_mc_5shot": ( + OEEvalTask, + {"dataset_path": "hellaswag", "dataset_name": "train_mc_5shot", "metric_type": "acc"}, + ), + "hellaswag_train_mc_5shot_bpb": ( + OEEvalTask, + {"dataset_path": "hellaswag", "dataset_name": "train_mc_5shot", "metric_type": "bpb"}, + ), + "hellaswag_val_rc_5shot": ( + OEEvalTask, + {"dataset_path": "hellaswag", "dataset_name": "val_rc_5shot", "metric_type": "len_norm"}, + ), + "hellaswag_val_rc_5shot_bpb": ( + OEEvalTask, + {"dataset_path": "hellaswag", "dataset_name": "val_rc_5shot", "metric_type": "bpb"}, + ), + "hellaswag_val_mc_5shot": ( + OEEvalTask, + {"dataset_path": "hellaswag", "dataset_name": "val_mc_5shot", "metric_type": "acc"}, + ), + "hellaswag_val_mc_5shot_bpb": ( + OEEvalTask, + {"dataset_path": "hellaswag", "dataset_name": "val_mc_5shot", "metric_type": "bpb"}, + ), + "openbookqa_train_rc_5shot": ( + OEEvalTask, + {"dataset_path": "openbookqa", "dataset_name": "train_rc_5shot", "metric_type": "len_norm"}, + ), + "openbookqa_train_rc_5shot_bpb": ( + OEEvalTask, + {"dataset_path": "openbookqa", "dataset_name": "train_rc_5shot", "metric_type": "bpb"}, + ), + "openbookqa_train_mc_5shot": ( + OEEvalTask, + {"dataset_path": "openbookqa", "dataset_name": "train_mc_5shot", "metric_type": "acc"}, + ), + "openbookqa_train_mc_5shot_bpb": ( + OEEvalTask, + {"dataset_path": "openbookqa", "dataset_name": "train_mc_5shot", "metric_type": "bpb"}, + ), + "openbookqa_val_rc_5shot": ( + OEEvalTask, + {"dataset_path": "openbookqa", "dataset_name": "val_rc_5shot", "metric_type": "len_norm"}, + ), + "openbookqa_val_rc_5shot_bpb": ( + OEEvalTask, + {"dataset_path": "openbookqa", "dataset_name": "val_rc_5shot", "metric_type": "bpb"}, + ), + "openbookqa_val_mc_5shot": ( + OEEvalTask, + {"dataset_path": "openbookqa", "dataset_name": "val_mc_5shot", "metric_type": "acc"}, + ), + "openbookqa_val_mc_5shot_bpb": ( + OEEvalTask, + {"dataset_path": "openbookqa", "dataset_name": "val_mc_5shot", "metric_type": "bpb"}, + ), + "openbookqa_test_rc_5shot": ( + OEEvalTask, + {"dataset_path": "openbookqa", "dataset_name": "test_rc_5shot", "metric_type": "len_norm"}, + ), + "openbookqa_test_rc_5shot_bpb": ( + OEEvalTask, + {"dataset_path": "openbookqa", "dataset_name": "test_rc_5shot", "metric_type": "bpb"}, + ), + "openbookqa_test_mc_5shot": ( + OEEvalTask, + {"dataset_path": "openbookqa", "dataset_name": "test_mc_5shot", "metric_type": "acc"}, + ), + "openbookqa_test_mc_5shot_bpb": ( + OEEvalTask, + {"dataset_path": "openbookqa", "dataset_name": "test_mc_5shot", "metric_type": "bpb"}, + ), + "piqa_train_rc_5shot": ( + OEEvalTask, + {"dataset_path": "piqa", "dataset_name": "train_rc_5shot", "metric_type": "len_norm"}, + ), + "piqa_train_rc_5shot_bpb": ( + OEEvalTask, + {"dataset_path": "piqa", "dataset_name": "train_rc_5shot", "metric_type": "bpb"}, + ), + "piqa_train_mc_5shot": ( + OEEvalTask, + {"dataset_path": "piqa", "dataset_name": "train_mc_5shot", "metric_type": "acc"}, + ), + "piqa_train_mc_5shot_bpb": ( + OEEvalTask, + {"dataset_path": "piqa", "dataset_name": "train_mc_5shot", "metric_type": "bpb"}, + ), + "piqa_val_rc_5shot": ( + OEEvalTask, + {"dataset_path": "piqa", "dataset_name": "val_rc_5shot", "metric_type": "len_norm"}, + ), + "piqa_val_rc_5shot_bpb": ( + OEEvalTask, + {"dataset_path": "piqa", "dataset_name": "val_rc_5shot", "metric_type": "bpb"}, + ), + "piqa_val_mc_5shot": ( + OEEvalTask, + {"dataset_path": "piqa", "dataset_name": "val_mc_5shot", "metric_type": "acc"}, + ), + "piqa_val_mc_5shot_bpb": ( + OEEvalTask, + {"dataset_path": "piqa", "dataset_name": "val_mc_5shot", "metric_type": "bpb"}, + ), + "socialiqa_train_rc_5shot": ( + OEEvalTask, + {"dataset_path": "socialiqa", "dataset_name": "train_rc_5shot", "metric_type": "len_norm"}, + ), + "socialiqa_train_rc_5shot_bpb": ( + OEEvalTask, + {"dataset_path": "socialiqa", "dataset_name": "train_rc_5shot", "metric_type": "bpb"}, + ), + "socialiqa_train_mc_5shot": ( + OEEvalTask, + {"dataset_path": "socialiqa", "dataset_name": "train_mc_5shot", "metric_type": "acc"}, + ), + "socialiqa_train_mc_5shot_bpb": ( + OEEvalTask, + {"dataset_path": "socialiqa", "dataset_name": "train_mc_5shot", "metric_type": "bpb"}, + ), + "socialiqa_val_rc_5shot": ( + OEEvalTask, + {"dataset_path": "socialiqa", "dataset_name": "val_rc_5shot", "metric_type": "len_norm"}, + ), + "socialiqa_val_rc_5shot_bpb": ( + OEEvalTask, + {"dataset_path": "socialiqa", "dataset_name": "val_rc_5shot", "metric_type": "bpb"}, + ), + "socialiqa_val_mc_5shot": ( + OEEvalTask, + {"dataset_path": "socialiqa", "dataset_name": "val_mc_5shot", "metric_type": "acc"}, + ), + "socialiqa_val_mc_5shot_bpb": ( + OEEvalTask, + {"dataset_path": "socialiqa", "dataset_name": "val_mc_5shot", "metric_type": "bpb"}, + ), + "winogrande_train_rc_5shot": ( + OEEvalTask, + {"dataset_path": "winogrande", "dataset_name": "train_rc_5shot", "metric_type": "len_norm"}, + ), # this used to be acc + "winogrande_train_rc_5shot_bpb": ( + OEEvalTask, + {"dataset_path": "winogrande", "dataset_name": "train_rc_5shot", "metric_type": "bpb"}, + ), + "winogrande_train_mc_5shot": ( + OEEvalTask, + {"dataset_path": "winogrande", "dataset_name": "train_mc_5shot", "metric_type": "acc"}, + ), + "winogrande_train_mc_5shot_bpb": ( + OEEvalTask, + {"dataset_path": "winogrande", "dataset_name": "train_mc_5shot", "metric_type": "bpb"}, + ), + "winogrande_val_rc_5shot": ( + OEEvalTask, + {"dataset_path": "winogrande", "dataset_name": "val_rc_5shot", "metric_type": "len_norm"}, + ), + "winogrande_val_rc_5shot_bpb": ( + OEEvalTask, + {"dataset_path": "winogrande", "dataset_name": "val_rc_5shot", "metric_type": "bpb"}, + ), + "winogrande_val_mc_5shot": ( + OEEvalTask, + {"dataset_path": "winogrande", "dataset_name": "val_mc_5shot", "metric_type": "acc"}, + ), + "winogrande_val_mc_5shot_bpb": ( + OEEvalTask, + {"dataset_path": "winogrande", "dataset_name": "val_mc_5shot", "metric_type": "bpb"}, + ), + "mmlu_stem_val_rc_var": (MMLU, {"dataset_name": "stem", "prompt_variations": 1}), + "mmlu_stem_val_rc_var_bpb": (MMLU, {"dataset_name": "stem", "prompt_variations": 1, "metric_type": "bpb"}), + "mmlu_stem_val_rc_5shot": (MMLU, {"dataset_name": "stem", "prompt_variations": 2}), + "mmlu_stem_val_rc_5shot_bpb": (MMLU, {"dataset_name": "stem", "prompt_variations": 2, "metric_type": "bpb"}), + "mmlu_stem_val_mc_5shot": (MMLU, {"dataset_name": "stem", "prompt_variations": 2, "mc_labels": True}), + "mmlu_stem_val_mc_5shot_bpb": ( + MMLU, + {"dataset_name": "stem", "prompt_variations": 2, "mc_labels": True, "metric_type": "bpb"}, + ), + "mmlu_stem_test_rc_var": (MMLU, {"dataset_name": "stem", "split": "test", "prompt_variations": 1}), + "mmlu_stem_test_rc_var_bpb": ( + MMLU, + {"dataset_name": "stem", "split": "test", "prompt_variations": 1, "metric_type": "bpb"}, + ), + "mmlu_stem_test_rc_5shot": (MMLU, {"dataset_name": "stem", "split": "test", "prompt_variations": 2}), + "mmlu_stem_test_rc_5shot_bpb": ( + MMLU, + {"dataset_name": "stem", "split": "test", "prompt_variations": 2, "metric_type": "bpb"}, + ), + "mmlu_stem_test_mc_5shot": ( + MMLU, + {"dataset_name": "stem", "split": "test", "prompt_variations": 2, "mc_labels": True}, + ), + "mmlu_stem_test_mc_5shot_bpb": ( + MMLU, + {"dataset_name": "stem", "split": "test", "prompt_variations": 2, "mc_labels": True, "metric_type": "bpb"}, + ), + "mmlu_humanities_val_rc_var": (MMLU, {"dataset_name": "humanities", "prompt_variations": 1}), + "mmlu_humanities_val_rc_var_bpb": ( + MMLU, + {"dataset_name": "humanities", "prompt_variations": 1, "metric_type": "bpb"}, + ), + "mmlu_humanities_val_rc_5shot": (MMLU, {"dataset_name": "humanities", "prompt_variations": 2}), + "mmlu_humanities_val_rc_5shot_bpb": ( + MMLU, + {"dataset_name": "humanities", "prompt_variations": 2, "metric_type": "bpb"}, + ), + "mmlu_humanities_val_mc_5shot": ( + MMLU, + {"dataset_name": "humanities", "prompt_variations": 2, "mc_labels": True}, + ), + "mmlu_humanities_val_mc_5shot_bpb": ( + MMLU, + {"dataset_name": "humanities", "prompt_variations": 2, "mc_labels": True, "metric_type": "bpb"}, + ), + "mmlu_humanities_test_rc_var": (MMLU, {"dataset_name": "humanities", "split": "test", "prompt_variations": 1}), + "mmlu_humanities_test_rc_var_bpb": ( + MMLU, + {"dataset_name": "humanities", "split": "test", "prompt_variations": 1, "metric_type": "bpb"}, + ), + "mmlu_humanities_test_rc_5shot": ( + MMLU, + {"dataset_name": "humanities", "split": "test", "prompt_variations": 2}, + ), + "mmlu_humanities_test_rc_5shot_bpb": ( + MMLU, + {"dataset_name": "humanities", "split": "test", "prompt_variations": 2, "metric_type": "bpb"}, + ), + "mmlu_humanities_test_mc_5shot": ( + MMLU, + {"dataset_name": "humanities", "split": "test", "prompt_variations": 2, "mc_labels": True}, + ), + "mmlu_humanities_test_mc_5shot_bpb": ( + MMLU, + { + "dataset_name": "humanities", + "split": "test", + "prompt_variations": 2, + "mc_labels": True, + "metric_type": "bpb", + }, + ), + "mmlu_social_sciences_val_rc_var": (MMLU, {"dataset_name": "social_sciences", "prompt_variations": 1}), + "mmlu_social_sciences_val_rc_var_bpb": ( + MMLU, + {"dataset_name": "social_sciences", "prompt_variations": 1, "metric_type": "bpb"}, + ), + "mmlu_social_sciences_val_rc_5shot": (MMLU, {"dataset_name": "social_sciences", "prompt_variations": 2}), + "mmlu_social_sciences_val_rc_5shot_bpb": ( + MMLU, + {"dataset_name": "social_sciences", "prompt_variations": 2, "metric_type": "bpb"}, + ), + "mmlu_social_sciences_val_mc_5shot": ( + MMLU, + {"dataset_name": "social_sciences", "prompt_variations": 2, "mc_labels": True}, + ), + "mmlu_social_sciences_val_mc_5shot_bpb": ( + MMLU, + {"dataset_name": "social_sciences", "prompt_variations": 2, "mc_labels": True, "metric_type": "bpb"}, + ), + "mmlu_social_sciences_test_rc_var": ( + MMLU, + {"dataset_name": "social_sciences", "split": "test", "prompt_variations": 1}, + ), + "mmlu_social_sciences_test_rc_var_bpb": ( + MMLU, + {"dataset_name": "social_sciences", "split": "test", "prompt_variations": 1, "metric_type": "bpb"}, + ), + "mmlu_social_sciences_test_rc_5shot": ( + MMLU, + {"dataset_name": "social_sciences", "split": "test", "prompt_variations": 2}, + ), + "mmlu_social_sciences_test_rc_5shot_bpb": ( + MMLU, + {"dataset_name": "social_sciences", "split": "test", "prompt_variations": 2, "metric_type": "bpb"}, + ), + "mmlu_social_sciences_test_mc_5shot": ( + MMLU, + {"dataset_name": "social_sciences", "split": "test", "prompt_variations": 2, "mc_labels": True}, + ), + "mmlu_social_sciences_test_mc_5shot_bpb": ( + MMLU, + { + "dataset_name": "social_sciences", + "split": "test", + "prompt_variations": 2, + "mc_labels": True, + "metric_type": "bpb", + }, + ), + "mmlu_other_val_rc_var": (MMLU, {"dataset_name": "other", "prompt_variations": 1}), + "mmlu_other_val_rc_var_bpb": (MMLU, {"dataset_name": "other", "prompt_variations": 1, "metric_type": "bpb"}), + "mmlu_other_val_rc_5shot": (MMLU, {"dataset_name": "other", "prompt_variations": 2}), + "mmlu_other_val_rc_5shot_bpb": (MMLU, {"dataset_name": "other", "prompt_variations": 2, "metric_type": "bpb"}), + "mmlu_other_val_mc_5shot": (MMLU, {"dataset_name": "other", "prompt_variations": 2, "mc_labels": True}), + "mmlu_other_val_mc_5shot_bpb": ( + MMLU, + {"dataset_name": "other", "prompt_variations": 2, "mc_labels": True, "metric_type": "bpb"}, + ), + "mmlu_other_test_rc_var": (MMLU, {"dataset_name": "other", "split": "test", "prompt_variations": 1}), + "mmlu_other_test_rc_var_bpb": ( + MMLU, + {"dataset_name": "other", "split": "test", "prompt_variations": 1, "metric_type": "bpb"}, + ), + "mmlu_other_test_rc_5shot": (MMLU, {"dataset_name": "other", "split": "test", "prompt_variations": 2}), + "mmlu_other_test_rc_5shot_bpb": ( + MMLU, + {"dataset_name": "other", "split": "test", "prompt_variations": 2, "metric_type": "bpb"}, + ), + "mmlu_other_test_mc_5shot": ( + MMLU, + {"dataset_name": "other", "split": "test", "prompt_variations": 2, "mc_labels": True}, + ), + "mmlu_other_test_mc_5shot_bpb": ( + MMLU, + { + "dataset_name": "other", + "split": "test", + "prompt_variations": 2, + "mc_labels": True, + "metric_type": "bpb", + }, + ), +} + +label_to_task_map = { + **label_to_task_map, + **label_to_task_map_new, +} diff --git a/olmo_data/oe_eval_tasks/arc_challenge/test_mc_5shot/config.json b/olmo_data/oe_eval_tasks/arc_challenge/test_mc_5shot/config.json new file mode 100644 index 000000000..9046b4b0b --- /dev/null +++ b/olmo_data/oe_eval_tasks/arc_challenge/test_mc_5shot/config.json @@ -0,0 +1 @@ +{"task_name": "arc_challenge:mc", "task_hash": "11abfade7ecce501f3e3e72c937e19cc", "task_config": {"task_name": "arc_challenge:mc", "task_core": "arc_challenge", "limit": 1000000, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {}, "native_id_field": "id", "fewshot_source": "OLMES:ARC-Challenge", "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "arc_challenge:mc::olmes"}}, "current_date": "2024-11-18 22:05:58 UTC", "num_instances": 1172} \ No newline at end of file diff --git a/olmo_data/oe_eval_tasks/arc_challenge/test_mc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/arc_challenge/test_mc_5shot/requests.jsonl.gz new file mode 100644 index 000000000..2b15ca26a Binary files /dev/null and b/olmo_data/oe_eval_tasks/arc_challenge/test_mc_5shot/requests.jsonl.gz differ diff --git a/olmo_data/oe_eval_tasks/arc_challenge/test_rc_5shot/config.json b/olmo_data/oe_eval_tasks/arc_challenge/test_rc_5shot/config.json new file mode 100644 index 000000000..19413b36c --- /dev/null +++ b/olmo_data/oe_eval_tasks/arc_challenge/test_rc_5shot/config.json @@ -0,0 +1 @@ +{"task_name": "arc_challenge", "task_hash": "b122d520ab0cf70114350ecf00c5c811", "task_config": {"task_name": "arc_challenge", "task_core": "arc_challenge", "limit": 1000000, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "id", "fewshot_source": "OLMES:ARC-Challenge", "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "arc_challenge:rc::olmes"}}, "current_date": "2024-11-18 21:50:18 UTC", "num_instances": 1172} \ No newline at end of file diff --git a/olmo_data/oe_eval_tasks/arc_challenge/test_rc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/arc_challenge/test_rc_5shot/requests.jsonl.gz new file mode 100644 index 000000000..c33b541bf Binary files /dev/null and b/olmo_data/oe_eval_tasks/arc_challenge/test_rc_5shot/requests.jsonl.gz differ diff --git a/olmo_data/oe_eval_tasks/arc_challenge/train_mc_5shot/config.json b/olmo_data/oe_eval_tasks/arc_challenge/train_mc_5shot/config.json new file mode 100644 index 000000000..32f357d25 --- /dev/null +++ b/olmo_data/oe_eval_tasks/arc_challenge/train_mc_5shot/config.json @@ -0,0 +1 @@ +{"task_name": "arc_challenge:mc", "task_hash": "cf2769a2dc6cbea724ff477c3d2543a2", "task_config": {"task_name": "arc_challenge:mc", "task_core": "arc_challenge", "limit": 1000000, "split": "train", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {}, "native_id_field": "id", "fewshot_source": "OLMES:ARC-Challenge", "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "arc_challenge:mc::olmes"}}, "current_date": "2024-11-18 22:05:40 UTC", "num_instances": 1119} \ No newline at end of file diff --git a/olmo_data/oe_eval_tasks/arc_challenge/train_mc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/arc_challenge/train_mc_5shot/requests.jsonl.gz new file mode 100644 index 000000000..a50cc9525 Binary files /dev/null and b/olmo_data/oe_eval_tasks/arc_challenge/train_mc_5shot/requests.jsonl.gz differ diff --git a/olmo_data/oe_eval_tasks/arc_challenge/train_rc_5shot/config.json b/olmo_data/oe_eval_tasks/arc_challenge/train_rc_5shot/config.json new file mode 100644 index 000000000..f82d472d6 --- /dev/null +++ b/olmo_data/oe_eval_tasks/arc_challenge/train_rc_5shot/config.json @@ -0,0 +1 @@ +{"task_name": "arc_challenge", "task_hash": "9045ed0bd68a7e9ff34cf51ff24828bf", "task_config": {"task_name": "arc_challenge", "task_core": "arc_challenge", "limit": 1000000, "split": "train", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "id", "fewshot_source": "OLMES:ARC-Challenge", "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "arc_challenge:rc::olmes"}}, "current_date": "2024-11-18 22:05:31 UTC", "num_instances": 1119} \ No newline at end of file diff --git a/olmo_data/oe_eval_tasks/arc_challenge/train_rc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/arc_challenge/train_rc_5shot/requests.jsonl.gz new file mode 100644 index 000000000..c93361804 Binary files /dev/null and b/olmo_data/oe_eval_tasks/arc_challenge/train_rc_5shot/requests.jsonl.gz differ diff --git a/olmo_data/oe_eval_tasks/arc_challenge/val_mc_5shot/config.json b/olmo_data/oe_eval_tasks/arc_challenge/val_mc_5shot/config.json new file mode 100644 index 000000000..d54a3f276 --- /dev/null +++ b/olmo_data/oe_eval_tasks/arc_challenge/val_mc_5shot/config.json @@ -0,0 +1 @@ +{"task_name": "arc_challenge:mc", "task_hash": "a673d7761ce3fc3d5061d72f76755971", "task_config": {"task_name": "arc_challenge:mc", "task_core": "arc_challenge", "limit": 1000000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {}, "native_id_field": "id", "fewshot_source": "OLMES:ARC-Challenge", "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "arc_challenge:mc::olmes"}}, "current_date": "2024-11-18 22:05:49 UTC", "num_instances": 299} \ No newline at end of file diff --git a/olmo_data/oe_eval_tasks/arc_challenge/val_mc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/arc_challenge/val_mc_5shot/requests.jsonl.gz new file mode 100644 index 000000000..ad56ccd0c Binary files /dev/null and b/olmo_data/oe_eval_tasks/arc_challenge/val_mc_5shot/requests.jsonl.gz differ diff --git a/olmo_data/oe_eval_tasks/arc_challenge/val_rc_5shot/config.json b/olmo_data/oe_eval_tasks/arc_challenge/val_rc_5shot/config.json new file mode 100644 index 000000000..b61640e44 --- /dev/null +++ b/olmo_data/oe_eval_tasks/arc_challenge/val_rc_5shot/config.json @@ -0,0 +1 @@ +{"task_name": "arc_challenge", "task_hash": "bd181c90c43b3ef799af2f300ea09cf1", "task_config": {"task_name": "arc_challenge", "task_core": "arc_challenge", "limit": 1000000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "id", "fewshot_source": "OLMES:ARC-Challenge", "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "arc_challenge:rc::olmes"}}, "current_date": "2024-11-18 21:45:07 UTC", "num_instances": 299} \ No newline at end of file diff --git a/olmo_data/oe_eval_tasks/arc_challenge/val_rc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/arc_challenge/val_rc_5shot/requests.jsonl.gz new file mode 100644 index 000000000..2022bd65e Binary files /dev/null and b/olmo_data/oe_eval_tasks/arc_challenge/val_rc_5shot/requests.jsonl.gz differ diff --git a/olmo_data/oe_eval_tasks/arc_easy/test_mc_5shot/config.json b/olmo_data/oe_eval_tasks/arc_easy/test_mc_5shot/config.json new file mode 100644 index 000000000..f0d69c824 --- /dev/null +++ b/olmo_data/oe_eval_tasks/arc_easy/test_mc_5shot/config.json @@ -0,0 +1 @@ +{"task_name": "arc_easy:mc", "task_hash": "64250ca6fdf0f02e07b539e8efc04922", "task_config": {"task_name": "arc_easy:mc", "task_core": "arc_easy", "limit": 1000000, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {}, "native_id_field": "id", "fewshot_source": "OLMES:ARC-Easy", "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"description": "ARC-Easy (MC) using OLMES-v0.1", "regimes": ["OLMES-v0.1"], "alias": "arc_easy:mc::olmes"}}, "current_date": "2024-11-18 22:06:33 UTC", "num_instances": 2376} \ No newline at end of file diff --git a/olmo_data/oe_eval_tasks/arc_easy/test_mc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/arc_easy/test_mc_5shot/requests.jsonl.gz new file mode 100644 index 000000000..54f5eef04 Binary files /dev/null and b/olmo_data/oe_eval_tasks/arc_easy/test_mc_5shot/requests.jsonl.gz differ diff --git a/olmo_data/oe_eval_tasks/arc_easy/test_rc_5shot/config.json b/olmo_data/oe_eval_tasks/arc_easy/test_rc_5shot/config.json new file mode 100644 index 000000000..78b2bd973 --- /dev/null +++ b/olmo_data/oe_eval_tasks/arc_easy/test_rc_5shot/config.json @@ -0,0 +1 @@ +{"task_name": "arc_easy", "task_hash": "ccbbd993c851d3300140d81ffec0e397", "task_config": {"task_name": "arc_easy", "task_core": "arc_easy", "limit": 1000000, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": {"description": null}, "generation_kwargs": {}, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "id", "fewshot_source": "OLMES:ARC-Easy", "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"description": "ARC-Easy (RC) using OLMES-v0.1", "regimes": ["OLMES-v0.1"], "alias": "arc_easy:rc::olmes"}}, "current_date": "2024-11-18 21:50:27 UTC", "num_instances": 2376} \ No newline at end of file diff --git a/olmo_data/oe_eval_tasks/arc_easy/test_rc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/arc_easy/test_rc_5shot/requests.jsonl.gz new file mode 100644 index 000000000..0cac53fc3 Binary files /dev/null and b/olmo_data/oe_eval_tasks/arc_easy/test_rc_5shot/requests.jsonl.gz differ diff --git a/olmo_data/oe_eval_tasks/arc_easy/train_mc_5shot/config.json b/olmo_data/oe_eval_tasks/arc_easy/train_mc_5shot/config.json new file mode 100644 index 000000000..c872294b3 --- /dev/null +++ b/olmo_data/oe_eval_tasks/arc_easy/train_mc_5shot/config.json @@ -0,0 +1 @@ +{"task_name": "arc_easy:mc", "task_hash": "afa7e96b485c4e4481b3b9b817faac36", "task_config": {"task_name": "arc_easy:mc", "task_core": "arc_easy", "limit": 1000000, "split": "train", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {}, "native_id_field": "id", "fewshot_source": "OLMES:ARC-Easy", "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"description": "ARC-Easy (MC) using OLMES-v0.1", "regimes": ["OLMES-v0.1"], "alias": "arc_easy:mc::olmes"}}, "current_date": "2024-11-18 22:06:16 UTC", "num_instances": 2251} \ No newline at end of file diff --git a/olmo_data/oe_eval_tasks/arc_easy/train_mc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/arc_easy/train_mc_5shot/requests.jsonl.gz new file mode 100644 index 000000000..ef8734c88 Binary files /dev/null and b/olmo_data/oe_eval_tasks/arc_easy/train_mc_5shot/requests.jsonl.gz differ diff --git a/olmo_data/oe_eval_tasks/arc_easy/train_rc_5shot/config.json b/olmo_data/oe_eval_tasks/arc_easy/train_rc_5shot/config.json new file mode 100644 index 000000000..cf32334d1 --- /dev/null +++ b/olmo_data/oe_eval_tasks/arc_easy/train_rc_5shot/config.json @@ -0,0 +1 @@ +{"task_name": "arc_easy", "task_hash": "4a5241b308edb45d7b9eab594093c519", "task_config": {"task_name": "arc_easy", "task_core": "arc_easy", "limit": 1000000, "split": "train", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": {"description": null}, "generation_kwargs": {}, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "id", "fewshot_source": "OLMES:ARC-Easy", "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"description": "ARC-Easy (RC) using OLMES-v0.1", "regimes": ["OLMES-v0.1"], "alias": "arc_easy:rc::olmes"}}, "current_date": "2024-11-18 22:06:07 UTC", "num_instances": 2251} \ No newline at end of file diff --git a/olmo_data/oe_eval_tasks/arc_easy/train_rc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/arc_easy/train_rc_5shot/requests.jsonl.gz new file mode 100644 index 000000000..e974da827 Binary files /dev/null and b/olmo_data/oe_eval_tasks/arc_easy/train_rc_5shot/requests.jsonl.gz differ diff --git a/olmo_data/oe_eval_tasks/arc_easy/val_mc_5shot/config.json b/olmo_data/oe_eval_tasks/arc_easy/val_mc_5shot/config.json new file mode 100644 index 000000000..6706d940a --- /dev/null +++ b/olmo_data/oe_eval_tasks/arc_easy/val_mc_5shot/config.json @@ -0,0 +1 @@ +{"task_name": "arc_easy:mc", "task_hash": "443bd52f752399615d01c853a8d7386c", "task_config": {"task_name": "arc_easy:mc", "task_core": "arc_easy", "limit": 1000000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {}, "native_id_field": "id", "fewshot_source": "OLMES:ARC-Easy", "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"description": "ARC-Easy (MC) using OLMES-v0.1", "regimes": ["OLMES-v0.1"], "alias": "arc_easy:mc::olmes"}}, "current_date": "2024-11-18 22:06:24 UTC", "num_instances": 570} \ No newline at end of file diff --git a/olmo_data/oe_eval_tasks/arc_easy/val_mc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/arc_easy/val_mc_5shot/requests.jsonl.gz new file mode 100644 index 000000000..60473206f Binary files /dev/null and b/olmo_data/oe_eval_tasks/arc_easy/val_mc_5shot/requests.jsonl.gz differ diff --git a/olmo_data/oe_eval_tasks/arc_easy/val_rc_5shot/config.json b/olmo_data/oe_eval_tasks/arc_easy/val_rc_5shot/config.json new file mode 100644 index 000000000..ca3c39ec2 --- /dev/null +++ b/olmo_data/oe_eval_tasks/arc_easy/val_rc_5shot/config.json @@ -0,0 +1 @@ +{"task_name": "arc_easy", "task_hash": "0045e4f588a617cbe9ee5a4ae8ca1ce5", "task_config": {"task_name": "arc_easy", "task_core": "arc_easy", "limit": 1000000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": {"description": null}, "generation_kwargs": {}, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "id", "fewshot_source": "OLMES:ARC-Easy", "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"description": "ARC-Easy (RC) using OLMES-v0.1", "regimes": ["OLMES-v0.1"], "alias": "arc_easy:rc::olmes"}}, "current_date": "2024-11-18 21:45:15 UTC", "num_instances": 570} \ No newline at end of file diff --git a/olmo_data/oe_eval_tasks/arc_easy/val_rc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/arc_easy/val_rc_5shot/requests.jsonl.gz new file mode 100644 index 000000000..4229a00e9 Binary files /dev/null and b/olmo_data/oe_eval_tasks/arc_easy/val_rc_5shot/requests.jsonl.gz differ diff --git a/olmo_data/oe_eval_tasks/boolq/train_mc_5shot/config.json b/olmo_data/oe_eval_tasks/boolq/train_mc_5shot/config.json new file mode 100644 index 000000000..37f015b42 --- /dev/null +++ b/olmo_data/oe_eval_tasks/boolq/train_mc_5shot/config.json @@ -0,0 +1 @@ +{"task_name": "boolq:mc", "task_hash": "a92ca849d7efd331110145eb71e4fc09", "task_config": {"task_name": "boolq:mc", "task_core": "boolq", "limit": 1000000, "split": "train", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {"uncond_docid_offset": null}, "native_id_field": "idx", "fewshot_source": "OLMES:BoolQ", "dataset_path": "super_glue", "dataset_name": "boolq", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "boolq:mc::olmes"}}, "current_date": "2024-11-18 22:06:52 UTC", "num_instances": 9427} \ No newline at end of file diff --git a/olmo_data/oe_eval_tasks/boolq/train_mc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/boolq/train_mc_5shot/requests.jsonl.gz new file mode 100644 index 000000000..a63d7866a Binary files /dev/null and b/olmo_data/oe_eval_tasks/boolq/train_mc_5shot/requests.jsonl.gz differ diff --git a/olmo_data/oe_eval_tasks/boolq/train_rc_5shot/config.json b/olmo_data/oe_eval_tasks/boolq/train_rc_5shot/config.json new file mode 100644 index 000000000..24e2ea699 --- /dev/null +++ b/olmo_data/oe_eval_tasks/boolq/train_rc_5shot/config.json @@ -0,0 +1 @@ +{"task_name": "boolq", "task_hash": "ec8729b372d310aaf3a222f37a7af7b9", "task_config": {"task_name": "boolq", "task_core": "boolq", "limit": 1000000, "split": "train", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {"uncond_docid_offset": null}, "native_id_field": "idx", "fewshot_source": "OLMES:BoolQ", "dataset_path": "super_glue", "dataset_name": "boolq", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "boolq:rc::olmes"}}, "current_date": "2024-11-18 22:06:41 UTC", "num_instances": 9427} \ No newline at end of file diff --git a/olmo_data/oe_eval_tasks/boolq/train_rc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/boolq/train_rc_5shot/requests.jsonl.gz new file mode 100644 index 000000000..7459af202 Binary files /dev/null and b/olmo_data/oe_eval_tasks/boolq/train_rc_5shot/requests.jsonl.gz differ diff --git a/olmo_data/oe_eval_tasks/boolq/val_mc_5shot/config.json b/olmo_data/oe_eval_tasks/boolq/val_mc_5shot/config.json new file mode 100644 index 000000000..d50a38746 --- /dev/null +++ b/olmo_data/oe_eval_tasks/boolq/val_mc_5shot/config.json @@ -0,0 +1 @@ +{"task_name": "boolq:mc", "task_hash": "d88f45757f4a8c3802b7274857894a90", "task_config": {"task_name": "boolq:mc", "task_core": "boolq", "limit": 1000000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {"uncond_docid_offset": null}, "native_id_field": "idx", "fewshot_source": "OLMES:BoolQ", "dataset_path": "super_glue", "dataset_name": "boolq", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "boolq:mc::olmes"}}, "current_date": "2024-11-18 22:07:01 UTC", "num_instances": 3270} \ No newline at end of file diff --git a/olmo_data/oe_eval_tasks/boolq/val_mc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/boolq/val_mc_5shot/requests.jsonl.gz new file mode 100644 index 000000000..8d349787b Binary files /dev/null and b/olmo_data/oe_eval_tasks/boolq/val_mc_5shot/requests.jsonl.gz differ diff --git a/olmo_data/oe_eval_tasks/boolq/val_rc_5shot/config.json b/olmo_data/oe_eval_tasks/boolq/val_rc_5shot/config.json new file mode 100644 index 000000000..883e60e63 --- /dev/null +++ b/olmo_data/oe_eval_tasks/boolq/val_rc_5shot/config.json @@ -0,0 +1 @@ +{"task_name": "boolq", "task_hash": "8942f8464f48343f6ab2773d4a75d344", "task_config": {"task_name": "boolq", "task_core": "boolq", "limit": 1000000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {"uncond_docid_offset": null}, "native_id_field": "idx", "fewshot_source": "OLMES:BoolQ", "dataset_path": "super_glue", "dataset_name": "boolq", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "boolq:rc::olmes"}}, "current_date": "2024-11-18 21:36:50 UTC", "num_instances": 3270} \ No newline at end of file diff --git a/olmo_data/oe_eval_tasks/boolq/val_rc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/boolq/val_rc_5shot/requests.jsonl.gz new file mode 100644 index 000000000..0d5ba6a18 Binary files /dev/null and b/olmo_data/oe_eval_tasks/boolq/val_rc_5shot/requests.jsonl.gz differ diff --git a/olmo_data/oe_eval_tasks/csqa/train_mc_5shot/config.json b/olmo_data/oe_eval_tasks/csqa/train_mc_5shot/config.json new file mode 100644 index 000000000..415db9c24 --- /dev/null +++ b/olmo_data/oe_eval_tasks/csqa/train_mc_5shot/config.json @@ -0,0 +1 @@ +{"task_name": "csqa:mc", "task_hash": "a47780e5c1faaccf3586bfc1e5cb020c", "task_config": {"task_name": "csqa:mc", "task_core": "csqa", "limit": 1000000, "split": "train", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {}, "native_id_field": "id", "fewshot_source": "OLMES:commonsense_qa", "dataset_path": "commonsense_qa", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "csqa:mc::olmes"}}, "current_date": "2024-11-18 22:07:22 UTC", "num_instances": 9741} \ No newline at end of file diff --git a/olmo_data/oe_eval_tasks/csqa/train_mc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/csqa/train_mc_5shot/requests.jsonl.gz new file mode 100644 index 000000000..99bc9dadc Binary files /dev/null and b/olmo_data/oe_eval_tasks/csqa/train_mc_5shot/requests.jsonl.gz differ diff --git a/olmo_data/oe_eval_tasks/csqa/train_rc_5shot/config.json b/olmo_data/oe_eval_tasks/csqa/train_rc_5shot/config.json new file mode 100644 index 000000000..29c65685e --- /dev/null +++ b/olmo_data/oe_eval_tasks/csqa/train_rc_5shot/config.json @@ -0,0 +1 @@ +{"task_name": "csqa", "task_hash": "d859c48ca5bd25f69a8a64ad585b4447", "task_config": {"task_name": "csqa", "task_core": "csqa", "limit": 1000000, "split": "train", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "id", "fewshot_source": "OLMES:commonsense_qa", "dataset_path": "commonsense_qa", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "csqa:rc::olmes"}}, "current_date": "2024-11-18 22:07:11 UTC", "num_instances": 9741} \ No newline at end of file diff --git a/olmo_data/oe_eval_tasks/csqa/train_rc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/csqa/train_rc_5shot/requests.jsonl.gz new file mode 100644 index 000000000..0660de98c Binary files /dev/null and b/olmo_data/oe_eval_tasks/csqa/train_rc_5shot/requests.jsonl.gz differ diff --git a/olmo_data/oe_eval_tasks/csqa/val_mc_5shot/config.json b/olmo_data/oe_eval_tasks/csqa/val_mc_5shot/config.json new file mode 100644 index 000000000..cafadf56f --- /dev/null +++ b/olmo_data/oe_eval_tasks/csqa/val_mc_5shot/config.json @@ -0,0 +1 @@ +{"task_name": "csqa:mc", "task_hash": "303a106ee47d83c3f9ae7e33bd993f0d", "task_config": {"task_name": "csqa:mc", "task_core": "csqa", "limit": 1000000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {}, "native_id_field": "id", "fewshot_source": "OLMES:commonsense_qa", "dataset_path": "commonsense_qa", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "csqa:mc::olmes"}}, "current_date": "2024-11-18 22:07:31 UTC", "num_instances": 1221} \ No newline at end of file diff --git a/olmo_data/oe_eval_tasks/csqa/val_mc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/csqa/val_mc_5shot/requests.jsonl.gz new file mode 100644 index 000000000..eabd6c653 Binary files /dev/null and b/olmo_data/oe_eval_tasks/csqa/val_mc_5shot/requests.jsonl.gz differ diff --git a/olmo_data/oe_eval_tasks/csqa/val_rc_5shot/config.json b/olmo_data/oe_eval_tasks/csqa/val_rc_5shot/config.json new file mode 100644 index 000000000..20387ae6f --- /dev/null +++ b/olmo_data/oe_eval_tasks/csqa/val_rc_5shot/config.json @@ -0,0 +1 @@ +{"task_name": "csqa", "task_hash": "6246a67a1b7a81aaa134aaae4480f1b9", "task_config": {"task_name": "csqa", "task_core": "csqa", "limit": 1000000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "id", "fewshot_source": "OLMES:commonsense_qa", "dataset_path": "commonsense_qa", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "csqa:rc::olmes"}}, "current_date": "2024-11-18 21:37:21 UTC", "num_instances": 1221} \ No newline at end of file diff --git a/olmo_data/oe_eval_tasks/csqa/val_rc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/csqa/val_rc_5shot/requests.jsonl.gz new file mode 100644 index 000000000..cd1d32af7 Binary files /dev/null and b/olmo_data/oe_eval_tasks/csqa/val_rc_5shot/requests.jsonl.gz differ diff --git a/olmo_data/oe_eval_tasks/hellaswag/train_mc_5shot/config.json b/olmo_data/oe_eval_tasks/hellaswag/train_mc_5shot/config.json new file mode 100644 index 000000000..7ee9e27b8 --- /dev/null +++ b/olmo_data/oe_eval_tasks/hellaswag/train_mc_5shot/config.json @@ -0,0 +1 @@ +{"task_name": "hellaswag:mc", "task_hash": "800769140f7d2dbfebbdcb6cabc30f0a", "task_config": {"task_name": "hellaswag:mc", "task_core": "hellaswag", "limit": 1000000, "split": "train", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {}, "native_id_field": "ind", "fewshot_source": "OLMES:hellaswag", "dataset_path": "hellaswag", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "hellaswag:mc::olmes"}}, "current_date": "2024-11-18 22:08:27 UTC", "num_instances": 39905} \ No newline at end of file diff --git a/olmo_data/oe_eval_tasks/hellaswag/train_mc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/hellaswag/train_mc_5shot/requests.jsonl.gz new file mode 100644 index 000000000..af0dce9d0 Binary files /dev/null and b/olmo_data/oe_eval_tasks/hellaswag/train_mc_5shot/requests.jsonl.gz differ diff --git a/olmo_data/oe_eval_tasks/hellaswag/train_rc_5shot/config.json b/olmo_data/oe_eval_tasks/hellaswag/train_rc_5shot/config.json new file mode 100644 index 000000000..39c016991 --- /dev/null +++ b/olmo_data/oe_eval_tasks/hellaswag/train_rc_5shot/config.json @@ -0,0 +1 @@ +{"task_name": "hellaswag", "task_hash": "25aa48c8c349d06cf85e1d9667d1c63f", "task_config": {"task_name": "hellaswag", "task_core": "hellaswag", "limit": 1000000, "split": "train", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "ind", "fewshot_source": "OLMES:hellaswag", "dataset_path": "hellaswag", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "hellaswag:rc::olmes"}}, "current_date": "2024-11-18 22:07:53 UTC", "num_instances": 39905} \ No newline at end of file diff --git a/olmo_data/oe_eval_tasks/hellaswag/train_rc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/hellaswag/train_rc_5shot/requests.jsonl.gz new file mode 100644 index 000000000..82d150785 Binary files /dev/null and b/olmo_data/oe_eval_tasks/hellaswag/train_rc_5shot/requests.jsonl.gz differ diff --git a/olmo_data/oe_eval_tasks/hellaswag/val_mc_5shot/config.json b/olmo_data/oe_eval_tasks/hellaswag/val_mc_5shot/config.json new file mode 100644 index 000000000..901a0ca53 --- /dev/null +++ b/olmo_data/oe_eval_tasks/hellaswag/val_mc_5shot/config.json @@ -0,0 +1 @@ +{"task_name": "hellaswag:mc", "task_hash": "8c78527c6c17a8765b7a0b53353e7266", "task_config": {"task_name": "hellaswag:mc", "task_core": "hellaswag", "limit": 1000000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {}, "native_id_field": "ind", "fewshot_source": "OLMES:hellaswag", "dataset_path": "hellaswag", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "hellaswag:mc::olmes"}}, "current_date": "2024-11-18 22:08:50 UTC", "num_instances": 10042} \ No newline at end of file diff --git a/olmo_data/oe_eval_tasks/hellaswag/val_mc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/hellaswag/val_mc_5shot/requests.jsonl.gz new file mode 100644 index 000000000..e7bf8cba4 Binary files /dev/null and b/olmo_data/oe_eval_tasks/hellaswag/val_mc_5shot/requests.jsonl.gz differ diff --git a/olmo_data/oe_eval_tasks/hellaswag/val_rc_5shot/config.json b/olmo_data/oe_eval_tasks/hellaswag/val_rc_5shot/config.json new file mode 100644 index 000000000..6913698e6 --- /dev/null +++ b/olmo_data/oe_eval_tasks/hellaswag/val_rc_5shot/config.json @@ -0,0 +1 @@ +{"task_name": "hellaswag", "task_hash": "e735ce12b24a16e9d583e6ee5bfe720f", "task_config": {"task_name": "hellaswag", "task_core": "hellaswag", "limit": 1000000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "ind", "fewshot_source": "OLMES:hellaswag", "dataset_path": "hellaswag", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "hellaswag:rc::olmes"}}, "current_date": "2024-11-18 21:30:24 UTC", "num_instances": 10042} \ No newline at end of file diff --git a/olmo_data/oe_eval_tasks/hellaswag/val_rc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/hellaswag/val_rc_5shot/requests.jsonl.gz new file mode 100644 index 000000000..dff9599d9 Binary files /dev/null and b/olmo_data/oe_eval_tasks/hellaswag/val_rc_5shot/requests.jsonl.gz differ diff --git a/olmo_data/oe_eval_tasks/openbookqa/test_mc_5shot/config.json b/olmo_data/oe_eval_tasks/openbookqa/test_mc_5shot/config.json new file mode 100644 index 000000000..57f770225 --- /dev/null +++ b/olmo_data/oe_eval_tasks/openbookqa/test_mc_5shot/config.json @@ -0,0 +1 @@ +{"task_name": "openbookqa:mc", "task_hash": "c29899952d034341728ff084a971203b", "task_config": {"task_name": "openbookqa:mc", "task_core": "openbookqa", "limit": 1000000, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {}, "native_id_field": "id", "fewshot_source": "OLMES:openbookqa", "dataset_path": "openbookqa", "dataset_name": "main", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "openbookqa:mc::olmes"}}, "current_date": "2024-11-18 22:09:27 UTC", "num_instances": 500} \ No newline at end of file diff --git a/olmo_data/oe_eval_tasks/openbookqa/test_mc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/openbookqa/test_mc_5shot/requests.jsonl.gz new file mode 100644 index 000000000..7204d4ebb Binary files /dev/null and b/olmo_data/oe_eval_tasks/openbookqa/test_mc_5shot/requests.jsonl.gz differ diff --git a/olmo_data/oe_eval_tasks/openbookqa/test_rc_5shot/config.json b/olmo_data/oe_eval_tasks/openbookqa/test_rc_5shot/config.json new file mode 100644 index 000000000..c9ea826f0 --- /dev/null +++ b/olmo_data/oe_eval_tasks/openbookqa/test_rc_5shot/config.json @@ -0,0 +1 @@ +{"task_name": "openbookqa", "task_hash": "85531fd2bc307ef4b3b7c8169838815e", "task_config": {"task_name": "openbookqa", "task_core": "openbookqa", "limit": 1000000, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": {"no_prefix": false}, "generation_kwargs": {}, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "id", "fewshot_source": "OLMES:openbookqa", "dataset_path": "openbookqa", "dataset_name": "main", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "openbookqa:rc::olmes"}}, "current_date": "2024-11-18 21:50:36 UTC", "num_instances": 500} \ No newline at end of file diff --git a/olmo_data/oe_eval_tasks/openbookqa/test_rc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/openbookqa/test_rc_5shot/requests.jsonl.gz new file mode 100644 index 000000000..7f1e1d44e Binary files /dev/null and b/olmo_data/oe_eval_tasks/openbookqa/test_rc_5shot/requests.jsonl.gz differ diff --git a/olmo_data/oe_eval_tasks/openbookqa/train_mc_5shot/config.json b/olmo_data/oe_eval_tasks/openbookqa/train_mc_5shot/config.json new file mode 100644 index 000000000..d46beb628 --- /dev/null +++ b/olmo_data/oe_eval_tasks/openbookqa/train_mc_5shot/config.json @@ -0,0 +1 @@ +{"task_name": "openbookqa:mc", "task_hash": "dd163decb9d61b713173d24932ba4a8f", "task_config": {"task_name": "openbookqa:mc", "task_core": "openbookqa", "limit": 1000000, "split": "train", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {}, "native_id_field": "id", "fewshot_source": "OLMES:openbookqa", "dataset_path": "openbookqa", "dataset_name": "main", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "openbookqa:mc::olmes"}}, "current_date": "2024-11-18 22:09:11 UTC", "num_instances": 4957} \ No newline at end of file diff --git a/olmo_data/oe_eval_tasks/openbookqa/train_mc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/openbookqa/train_mc_5shot/requests.jsonl.gz new file mode 100644 index 000000000..40f4896f1 Binary files /dev/null and b/olmo_data/oe_eval_tasks/openbookqa/train_mc_5shot/requests.jsonl.gz differ diff --git a/olmo_data/oe_eval_tasks/openbookqa/train_rc_5shot/config.json b/olmo_data/oe_eval_tasks/openbookqa/train_rc_5shot/config.json new file mode 100644 index 000000000..0cb8c61c9 --- /dev/null +++ b/olmo_data/oe_eval_tasks/openbookqa/train_rc_5shot/config.json @@ -0,0 +1 @@ +{"task_name": "openbookqa", "task_hash": "7cad17f1f91900d30ccdc83be08e58ab", "task_config": {"task_name": "openbookqa", "task_core": "openbookqa", "limit": 1000000, "split": "train", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": {"no_prefix": false}, "generation_kwargs": {}, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "id", "fewshot_source": "OLMES:openbookqa", "dataset_path": "openbookqa", "dataset_name": "main", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "openbookqa:rc::olmes"}}, "current_date": "2024-11-18 22:09:02 UTC", "num_instances": 4957} \ No newline at end of file diff --git a/olmo_data/oe_eval_tasks/openbookqa/train_rc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/openbookqa/train_rc_5shot/requests.jsonl.gz new file mode 100644 index 000000000..9103f06b0 Binary files /dev/null and b/olmo_data/oe_eval_tasks/openbookqa/train_rc_5shot/requests.jsonl.gz differ diff --git a/olmo_data/oe_eval_tasks/openbookqa/val_mc_5shot/config.json b/olmo_data/oe_eval_tasks/openbookqa/val_mc_5shot/config.json new file mode 100644 index 000000000..9cbaf5768 --- /dev/null +++ b/olmo_data/oe_eval_tasks/openbookqa/val_mc_5shot/config.json @@ -0,0 +1 @@ +{"task_name": "openbookqa:mc", "task_hash": "e48b50ed535a645bd629f1dbe0366716", "task_config": {"task_name": "openbookqa:mc", "task_core": "openbookqa", "limit": 1000000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {}, "native_id_field": "id", "fewshot_source": "OLMES:openbookqa", "dataset_path": "openbookqa", "dataset_name": "main", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "openbookqa:mc::olmes"}}, "current_date": "2024-11-18 22:09:19 UTC", "num_instances": 500} \ No newline at end of file diff --git a/olmo_data/oe_eval_tasks/openbookqa/val_mc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/openbookqa/val_mc_5shot/requests.jsonl.gz new file mode 100644 index 000000000..aa4fbda4b Binary files /dev/null and b/olmo_data/oe_eval_tasks/openbookqa/val_mc_5shot/requests.jsonl.gz differ diff --git a/olmo_data/oe_eval_tasks/openbookqa/val_rc_5shot/config.json b/olmo_data/oe_eval_tasks/openbookqa/val_rc_5shot/config.json new file mode 100644 index 000000000..423f9aeda --- /dev/null +++ b/olmo_data/oe_eval_tasks/openbookqa/val_rc_5shot/config.json @@ -0,0 +1 @@ +{"task_name": "openbookqa", "task_hash": "abaed82ecf27bf6455dbef7b410fffda", "task_config": {"task_name": "openbookqa", "task_core": "openbookqa", "limit": 1000000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": {"no_prefix": false}, "generation_kwargs": {}, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "id", "fewshot_source": "OLMES:openbookqa", "dataset_path": "openbookqa", "dataset_name": "main", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "openbookqa:rc::olmes"}}, "current_date": "2024-11-18 21:41:51 UTC", "num_instances": 500} \ No newline at end of file diff --git a/olmo_data/oe_eval_tasks/openbookqa/val_rc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/openbookqa/val_rc_5shot/requests.jsonl.gz new file mode 100644 index 000000000..d8da65bf3 Binary files /dev/null and b/olmo_data/oe_eval_tasks/openbookqa/val_rc_5shot/requests.jsonl.gz differ diff --git a/olmo_data/oe_eval_tasks/piqa/train_mc_5shot/config.json b/olmo_data/oe_eval_tasks/piqa/train_mc_5shot/config.json new file mode 100644 index 000000000..ebc1074a9 --- /dev/null +++ b/olmo_data/oe_eval_tasks/piqa/train_mc_5shot/config.json @@ -0,0 +1 @@ +{"task_name": "piqa:mc", "task_hash": "72a36e74abdbc603222dceff6a736141", "task_config": {"task_name": "piqa:mc", "task_core": "piqa", "limit": 1000000, "split": "train", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {}, "native_id_field": "index", "fewshot_source": "OLMES:piqa", "dataset_path": "piqa", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "piqa:mc::olmes"}}, "current_date": "2024-11-18 22:09:50 UTC", "num_instances": 16113} \ No newline at end of file diff --git a/olmo_data/oe_eval_tasks/piqa/train_mc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/piqa/train_mc_5shot/requests.jsonl.gz new file mode 100644 index 000000000..b51e19d99 Binary files /dev/null and b/olmo_data/oe_eval_tasks/piqa/train_mc_5shot/requests.jsonl.gz differ diff --git a/olmo_data/oe_eval_tasks/piqa/train_rc_5shot/config.json b/olmo_data/oe_eval_tasks/piqa/train_rc_5shot/config.json new file mode 100644 index 000000000..fe1e12b9a --- /dev/null +++ b/olmo_data/oe_eval_tasks/piqa/train_rc_5shot/config.json @@ -0,0 +1 @@ +{"task_name": "piqa", "task_hash": "88d4acbf580dfde078ed155408942b4f", "task_config": {"task_name": "piqa", "task_core": "piqa", "limit": 1000000, "split": "train", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": "OLMES:piqa", "dataset_path": "piqa", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "piqa:rc::olmes"}}, "current_date": "2024-11-18 22:09:38 UTC", "num_instances": 16113} \ No newline at end of file diff --git a/olmo_data/oe_eval_tasks/piqa/train_rc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/piqa/train_rc_5shot/requests.jsonl.gz new file mode 100644 index 000000000..d04a2a737 Binary files /dev/null and b/olmo_data/oe_eval_tasks/piqa/train_rc_5shot/requests.jsonl.gz differ diff --git a/olmo_data/oe_eval_tasks/piqa/val_mc_5shot/config.json b/olmo_data/oe_eval_tasks/piqa/val_mc_5shot/config.json new file mode 100644 index 000000000..8b6eb6988 --- /dev/null +++ b/olmo_data/oe_eval_tasks/piqa/val_mc_5shot/config.json @@ -0,0 +1 @@ +{"task_name": "piqa:mc", "task_hash": "3692c0fd172e26cd97e4a75b14cf257e", "task_config": {"task_name": "piqa:mc", "task_core": "piqa", "limit": 1000000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {}, "native_id_field": "index", "fewshot_source": "OLMES:piqa", "dataset_path": "piqa", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "piqa:mc::olmes"}}, "current_date": "2024-11-18 22:10:00 UTC", "num_instances": 1838} \ No newline at end of file diff --git a/olmo_data/oe_eval_tasks/piqa/val_mc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/piqa/val_mc_5shot/requests.jsonl.gz new file mode 100644 index 000000000..c04afea2c Binary files /dev/null and b/olmo_data/oe_eval_tasks/piqa/val_mc_5shot/requests.jsonl.gz differ diff --git a/olmo_data/oe_eval_tasks/piqa/val_rc_5shot/config.json b/olmo_data/oe_eval_tasks/piqa/val_rc_5shot/config.json new file mode 100644 index 000000000..829c52d2a --- /dev/null +++ b/olmo_data/oe_eval_tasks/piqa/val_rc_5shot/config.json @@ -0,0 +1 @@ +{"task_name": "piqa", "task_hash": "ca86822d78062d11a8a4a70b455d70bf", "task_config": {"task_name": "piqa", "task_core": "piqa", "limit": 1000000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": "OLMES:piqa", "dataset_path": "piqa", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "piqa:rc::olmes"}}, "current_date": "2024-11-18 21:42:25 UTC", "num_instances": 1838} \ No newline at end of file diff --git a/olmo_data/oe_eval_tasks/piqa/val_rc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/piqa/val_rc_5shot/requests.jsonl.gz new file mode 100644 index 000000000..3e330668e Binary files /dev/null and b/olmo_data/oe_eval_tasks/piqa/val_rc_5shot/requests.jsonl.gz differ diff --git a/olmo_data/oe_eval_tasks/socialiqa/train_mc_5shot/config.json b/olmo_data/oe_eval_tasks/socialiqa/train_mc_5shot/config.json new file mode 100644 index 000000000..36804a2f1 --- /dev/null +++ b/olmo_data/oe_eval_tasks/socialiqa/train_mc_5shot/config.json @@ -0,0 +1 @@ +{"task_name": "socialiqa:mc", "task_hash": "34126a4338f12048610e837d10c3ce03", "task_config": {"task_name": "socialiqa:mc", "task_core": "socialiqa", "limit": 1000000, "split": "train", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {}, "native_id_field": "index", "fewshot_source": "OLMES:social_i_qa", "dataset_path": "social_i_qa", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "socialiqa:mc::olmes"}}, "current_date": "2024-11-18 22:10:34 UTC", "num_instances": 33410} \ No newline at end of file diff --git a/olmo_data/oe_eval_tasks/socialiqa/train_mc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/socialiqa/train_mc_5shot/requests.jsonl.gz new file mode 100644 index 000000000..23ebb249b Binary files /dev/null and b/olmo_data/oe_eval_tasks/socialiqa/train_mc_5shot/requests.jsonl.gz differ diff --git a/olmo_data/oe_eval_tasks/socialiqa/train_rc_5shot/config.json b/olmo_data/oe_eval_tasks/socialiqa/train_rc_5shot/config.json new file mode 100644 index 000000000..73cf0ebf6 --- /dev/null +++ b/olmo_data/oe_eval_tasks/socialiqa/train_rc_5shot/config.json @@ -0,0 +1 @@ +{"task_name": "socialiqa", "task_hash": "8b5e1d3d948d7a8e8b58939cd9948504", "task_config": {"task_name": "socialiqa", "task_core": "socialiqa", "limit": 1000000, "split": "train", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": "OLMES:social_i_qa", "dataset_path": "social_i_qa", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "socialiqa:rc::olmes"}}, "current_date": "2024-11-18 22:10:14 UTC", "num_instances": 33410} \ No newline at end of file diff --git a/olmo_data/oe_eval_tasks/socialiqa/train_rc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/socialiqa/train_rc_5shot/requests.jsonl.gz new file mode 100644 index 000000000..d71d37e10 Binary files /dev/null and b/olmo_data/oe_eval_tasks/socialiqa/train_rc_5shot/requests.jsonl.gz differ diff --git a/olmo_data/oe_eval_tasks/socialiqa/val_mc_5shot/config.json b/olmo_data/oe_eval_tasks/socialiqa/val_mc_5shot/config.json new file mode 100644 index 000000000..b3a95ad38 --- /dev/null +++ b/olmo_data/oe_eval_tasks/socialiqa/val_mc_5shot/config.json @@ -0,0 +1 @@ +{"task_name": "socialiqa:mc", "task_hash": "47bf4f8936248fcad679637bf32a121d", "task_config": {"task_name": "socialiqa:mc", "task_core": "socialiqa", "limit": 1000000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {}, "native_id_field": "index", "fewshot_source": "OLMES:social_i_qa", "dataset_path": "social_i_qa", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "socialiqa:mc::olmes"}}, "current_date": "2024-11-18 22:10:46 UTC", "num_instances": 1954} \ No newline at end of file diff --git a/olmo_data/oe_eval_tasks/socialiqa/val_mc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/socialiqa/val_mc_5shot/requests.jsonl.gz new file mode 100644 index 000000000..a940f9dbf Binary files /dev/null and b/olmo_data/oe_eval_tasks/socialiqa/val_mc_5shot/requests.jsonl.gz differ diff --git a/olmo_data/oe_eval_tasks/socialiqa/val_rc_5shot/config.json b/olmo_data/oe_eval_tasks/socialiqa/val_rc_5shot/config.json new file mode 100644 index 000000000..0a9c89675 --- /dev/null +++ b/olmo_data/oe_eval_tasks/socialiqa/val_rc_5shot/config.json @@ -0,0 +1 @@ +{"task_name": "socialiqa", "task_hash": "5d584a138e2c0203b49ed1a0bb68b2b6", "task_config": {"task_name": "socialiqa", "task_core": "socialiqa", "limit": 1000000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": "OLMES:social_i_qa", "dataset_path": "social_i_qa", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "socialiqa:rc::olmes"}}, "current_date": "2024-11-18 21:42:41 UTC", "num_instances": 1954} \ No newline at end of file diff --git a/olmo_data/oe_eval_tasks/socialiqa/val_rc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/socialiqa/val_rc_5shot/requests.jsonl.gz new file mode 100644 index 000000000..5cc9671f0 Binary files /dev/null and b/olmo_data/oe_eval_tasks/socialiqa/val_rc_5shot/requests.jsonl.gz differ diff --git a/olmo_data/oe_eval_tasks/winogrande/train_mc_5shot/config.json b/olmo_data/oe_eval_tasks/winogrande/train_mc_5shot/config.json new file mode 100644 index 000000000..48d92c25d --- /dev/null +++ b/olmo_data/oe_eval_tasks/winogrande/train_mc_5shot/config.json @@ -0,0 +1 @@ +{"task_name": "winogrande:mc", "task_hash": "e056a4b5a7e921b8cda427d548e4352c", "task_config": {"task_name": "winogrande:mc", "task_core": "winogrande", "limit": 1000000, "split": "train", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {}, "native_id_field": "index", "fewshot_source": "OLMES:winogrande", "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "winogrande:mc::olmes"}}, "current_date": "2024-11-18 22:11:12 UTC", "num_instances": 40398} \ No newline at end of file diff --git a/olmo_data/oe_eval_tasks/winogrande/train_mc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/winogrande/train_mc_5shot/requests.jsonl.gz new file mode 100644 index 000000000..2ec442b9c Binary files /dev/null and b/olmo_data/oe_eval_tasks/winogrande/train_mc_5shot/requests.jsonl.gz differ diff --git a/olmo_data/oe_eval_tasks/winogrande/train_rc_5shot/config.json b/olmo_data/oe_eval_tasks/winogrande/train_rc_5shot/config.json new file mode 100644 index 000000000..06b3f3197 --- /dev/null +++ b/olmo_data/oe_eval_tasks/winogrande/train_rc_5shot/config.json @@ -0,0 +1 @@ +{"task_name": "winogrande", "task_hash": "50dbaebd49327d22ae8f25be8bda8ed5", "task_config": {"task_name": "winogrande", "task_core": "winogrande", "limit": 1000000, "split": "train", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {}, "native_id_field": null, "fewshot_source": "OLMES:winogrande", "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "winogrande:rc::olmes"}}, "current_date": "2024-11-18 22:10:57 UTC", "num_instances": 40398} \ No newline at end of file diff --git a/olmo_data/oe_eval_tasks/winogrande/train_rc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/winogrande/train_rc_5shot/requests.jsonl.gz new file mode 100644 index 000000000..52bc59f5d Binary files /dev/null and b/olmo_data/oe_eval_tasks/winogrande/train_rc_5shot/requests.jsonl.gz differ diff --git a/olmo_data/oe_eval_tasks/winogrande/val_mc_5shot/config.json b/olmo_data/oe_eval_tasks/winogrande/val_mc_5shot/config.json new file mode 100644 index 000000000..255d33433 --- /dev/null +++ b/olmo_data/oe_eval_tasks/winogrande/val_mc_5shot/config.json @@ -0,0 +1 @@ +{"task_name": "winogrande:mc", "task_hash": "681e052d61dceaa873326356fbebe83d", "task_config": {"task_name": "winogrande:mc", "task_core": "winogrande", "limit": 1000000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {}, "native_id_field": "index", "fewshot_source": "OLMES:winogrande", "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "winogrande:mc::olmes"}}, "current_date": "2024-11-18 22:11:22 UTC", "num_instances": 1267} \ No newline at end of file diff --git a/olmo_data/oe_eval_tasks/winogrande/val_mc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/winogrande/val_mc_5shot/requests.jsonl.gz new file mode 100644 index 000000000..b81f1cf39 Binary files /dev/null and b/olmo_data/oe_eval_tasks/winogrande/val_mc_5shot/requests.jsonl.gz differ diff --git a/olmo_data/oe_eval_tasks/winogrande/val_rc_5shot/config.json b/olmo_data/oe_eval_tasks/winogrande/val_rc_5shot/config.json new file mode 100644 index 000000000..6a32cf380 --- /dev/null +++ b/olmo_data/oe_eval_tasks/winogrande/val_rc_5shot/config.json @@ -0,0 +1 @@ +{"task_name": "winogrande", "task_hash": "ef12b7df6d713ce11f8e8ff69dbe4b22", "task_config": {"task_name": "winogrande", "task_core": "winogrande", "limit": 1000000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {}, "metric_kwargs": {}, "native_id_field": null, "fewshot_source": "OLMES:winogrande", "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "winogrande:rc::olmes"}}, "current_date": "2024-11-18 21:43:43 UTC", "num_instances": 1267} \ No newline at end of file diff --git a/olmo_data/oe_eval_tasks/winogrande/val_rc_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/winogrande/val_rc_5shot/requests.jsonl.gz new file mode 100644 index 000000000..20473318b Binary files /dev/null and b/olmo_data/oe_eval_tasks/winogrande/val_rc_5shot/requests.jsonl.gz differ