From a244fc1a0a2e7686b816d6a32dee3b5636ce9eee Mon Sep 17 00:00:00 2001
From: sileod
Date: Fri, 6 Jan 2023 15:20:37 +0100
Subject: [PATCH] .

---
 src/tasknet/models.py |  96 ++++++----
 src/tasknet/tasks.py  |  31 ++--
 src/tasknet/utils.py  |  44 ++++-
 src/tasks.md          | 422 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 543 insertions(+), 50 deletions(-)
 create mode 100644 src/tasks.md

diff --git a/src/tasknet/models.py b/src/tasknet/models.py
index d98f4fd..a21f6da 100755
--- a/src/tasknet/models.py
+++ b/src/tasknet/models.py
@@ -8,65 +8,88 @@ from torch.utils.data.sampler import RandomSampler, WeightedRandomSampler
 from typing import List, Union, Dict
 from transformers import (
-    AutoModelForSeq2SeqLM,
+    EncoderDecoderModel,
     DataCollatorForSeq2Seq,
+    AutoModelForSeq2SeqLM,
     AutoModelForSequenceClassification,
     AutoModelForMultipleChoice,
     AutoModelForTokenClassification,
 )
-from transformers import EncoderDecoderModel
 from easydict import EasyDict as edict
 import funcy as fc
 import copy
 import logging
-import functools
 from types import MappingProxyType
 from .tasks import Classification
-from .utils import to_dict
+from .utils import to_dict, shallow_copy_A_to_B, deep_copy_cache, normalize_label
 from transformers import AutoTokenizer
 import magicattr
+import gc
+import random
+
+
+def progress(l):
+    # Wrap l in a tqdm progress bar, but only when tqdm is installed
+    # and l is long enough for a bar to be useful.
+    try:
+        from tqdm.auto import tqdm
+        assert len(l) > 8
+        return tqdm(l)
+    except:
+        return l
+
+
 class CLSEmbedding(nn.Module):
-    def __init__(self, Zi):
+    def __init__(self, Zi, drop_probability=0.0):
         super().__init__()
         self.cls = Zi
-
+        self.drop_probability = drop_probability
     def forward(self, x):
-        x[:, 0, :] = x[:, 0, :] + self.cls
+        if random.random() > self.drop_probability:
+            x[:, 0, :] = x[:, 0, :] + self.cls
         return x
 
+
+class WandbTaskCallback(transformers.integrations.WandbCallback):
+
+    def on_log(self, args, state, control, model=None, logs=None, **kwargs):
+        import wandb
+        if not self._initialized:
+            self.setup(args, state, model, reinit=False)
+        if state.is_world_process_zero:
+            if 'eval_name' in logs:
+                # Prefix each metric with its task name, e.g. "anli/r1/accuracy".
+                logs = {f"{logs['eval_name']}/{k}": v for (k, v) in logs.items() if k != "eval_name"}
+            wandb.log(logs, step=state.global_step)
+
+
 class Model(transformers.PreTrainedModel):
     def __init__(self, tasks, args, warm_start=None):
         super().__init__(transformers.PretrainedConfig())
+        args = to_dict(args)
         self.shared_encoder = warm_start
-        mc_model = None
         self.models = {}
         task_models_list = []
-        for i, task in enumerate(tasks):
+        for i, task in enumerate(progress(tasks)):
             model_type = eval(f"AutoModelFor{task.task_type}")
             nl = {a: getattr(task, a) for a in ('num_labels', 'problem_type') if hasattr(task, a)}
-            model = model_type.from_pretrained(args.model_name, **nl)
+            model = deep_copy_cache(model_type.from_pretrained)(args.model_name, **nl)
             if task.task_type == 'MultipleChoice':
-                key = "mc"
+                key = task.task_type
             else:
                 labels = getattr(task.dataset['train'].features[task.y], "names", None)
-                key = (tuple(labels) if labels else None)
+                key = tuple([normalize_label(x) for x in labels]) if labels else None
+                key = key if task.num_labels != 2 or key else "binary"
             if key and key not in self.models:
                 self.models[key] = model
             if key and key in self.models:
-                self.shallow_copy(self.models[key].classifier, model.classifier)
+                model.classifier.weight = self.models[key].classifier.weight
             model.auto = getattr(model, self.get_encoder_attr_name(model))
             if self.shared_encoder is None:
                 self.shared_encoder = model.auto
             else:
-                self.shallow_copy(self.shared_encoder, model.auto)
+                shallow_copy_A_to_B(self.shared_encoder, model.auto)
             task_models_list += [model]
             model.i = i
@@ -85,32 +108,18 @@ def __init__(self, tasks, args, warm_start=None):
             emb_name, emb_module = [(name, module) for name, module in m_i.named_modules() if isinstance(module, torch.nn.Embedding)][0]
             magicattr.set(m_i, emb_name,
-                nn.Sequential(emb_module, CLSEmbedding(self.Z[i]))
+                nn.Sequential(emb_module,
+                    CLSEmbedding(
+                        self.Z[i],
+                        drop_probability=args.get('cls_emb_drop_probability', 0.0))
+                )
             )
+        torch.cuda.empty_cache()
+        gc.collect()
 
     def set_encoder(self, encoder):
         for model in self.task_models_list:
-            self.shallow_copy(encoder, getattr(model, self.get_encoder_attr_name(model)))
-
-
-    @staticmethod
-    def shallow_copy(A, B):
-        """Shallow copy (=parameter sharing) A into B
-        https://stackoverflow.com/questions/31174295/getattr-and-setattr-on-nested-objects/31174427?noredirect=1#comment86638618_31174427"""
-
-        def rsetattr(obj, attr, val):
-            pre, _, post = attr.rpartition(".")
-            return setattr(rgetattr(obj, pre) if pre else obj, post, val)
-
-        def rgetattr(obj, attr, *args):
-            def _getattr(obj, attr):
-                return getattr(obj, attr, *args)
-
-            return functools.reduce(_getattr, [obj] + attr.split("."))
-
-        for (na, _), (nb, _) in zip(A.named_parameters(), B.named_parameters()):
-            rsetattr(B, nb, rgetattr(A, na))
-        return A, B
+            shallow_copy_A_to_B(encoder, getattr(model, self.get_encoder_attr_name(model)))
 
     @classmethod
     def get_encoder_attr_name(cls, model):
@@ -249,8 +258,18 @@ class default:
         # transformers.Trainer recognizes eval_dataset instances of "dict"
         # But we use a custom "evaluate" function so that we can use different metrics for each task
         self.eval_dataset = MappingProxyType(self.eval_dataset)
+        self.fix_callback()
         self.cleanup_outputs()
 
+    def fix_callback(self):
+        try:
+            import wandb
+        except:
+            return
+        i = [i for (i, c) in enumerate(self.callback_handler.callbacks) if 'Wandb' in str(c)]
+        if i:
+            self.callback_handler.callbacks[i[0]] = WandbTaskCallback()
+
     @staticmethod
     def cleanup_outputs():
         try:
@@ -274,8 +293,9 @@ def write_line(other, values):
 
     def evaluate(self, **kwargs):
         try:
-            self.callback_handler.callbacks[-1].training_tracker.write_line = fc.partial(
-                self.write_line, self.callback_handler.callbacks[-1].training_tracker
+            i = [i for (i, c) in enumerate(self.callback_handler.callbacks) if 'NotebookProgress' in str(c)][0]
+            self.callback_handler.callbacks[i].training_tracker.write_line = fc.partial(
+                self.write_line, self.callback_handler.callbacks[i].training_tracker
             )
         except:
             logging.info('No training_tracker')
diff --git a/src/tasknet/tasks.py b/src/tasknet/tasks.py
index 9f6a03f..cda6bbf 100755
--- a/src/tasknet/tasks.py
+++ b/src/tasknet/tasks.py
@@ -27,13 +27,23 @@ def get_dataset_name(dataset):
     except:
         return ""
 
-def sample_dataset(dataset,n=10000, n_eval=1000):
+def oversample(dataset, n=2):
+    # Concatenate n shuffled copies of the train split.
+    dataset['train'] = datasets.concatenate_datasets(
+        [dataset['train'].shuffle(_) for _ in range(n)]
+    )
+    return dataset
+
+def sample_dataset(dataset, n=10000, n_eval=1000, oversampling=None):
+    if oversampling and len(dataset['train']) < n:
+        dataset = oversample(dataset, oversampling)
     for k in dataset:
         n_k = (n if k == 'train' else n_eval)
         if len(dataset[k]) > n_k:
             dataset[k] = dataset[k].train_test_split(train_size=n_k)['train']
     return dataset
 
+
 @dataclass
 class Task:
     dataset: Dataset = None
@@ -42,6 +52,7 @@ class Task:
     tokenizer_kwargs: ... = fdict(padding="max_length", max_length=256, truncation=True)
     max_rows: int = None
     max_rows_eval: int = None
+    oversampling: int = None
 
     def __hash__(self):
         return hash(str(self.dataset.__dict__))
@@ -60,7 +71,8 @@ def __post_init__(self):
         if not self.name:
             self.name = name
-        self.dataset = sample_dataset(self.dataset, self.max_rows, self.max_rows_eval)
+        self.results = []
+        self.dataset = sample_dataset(self.dataset, self.max_rows, self.max_rows_eval, self.oversampling)
 
     def check(self):
         return True
@@ -130,7 +142,9 @@ def compute_metrics(self, eval_pred):
         else:
             metric = load_metric("glue", "stsb")
         meta = {"name": self.name, "size": len(predictions), "index": self.index}
-        return {**metric.compute(predictions=predictions, references=labels, **avg), **meta}
+        metrics = metric.compute(predictions=predictions, references=labels, **avg)
+        self.results += [metrics]
+        return {**metrics, **meta}
 
 
 @dataclass
@@ -283,14 +297,9 @@ def compute_metrics(self, eval_pred):
             predictions=true_predictions, references=true_labels
         )
         meta = {"name": self.name, "size": len(predictions), "index": self.index}
-
-        return {
-            "precision": all_metrics["overall_precision"],
-            "recall": all_metrics["overall_recall"],
-            "f1": all_metrics["overall_f1"],
-            "accuracy": all_metrics["overall_accuracy"],
-            **meta,
-        }
+        metrics = {k.replace("overall_", ""): v for k, v in all_metrics.items() if "overall" in k}
+        self.results += [metrics]
+        return {**metrics, **meta}
 
     def check(self):
         features = self.dataset['train'].features
diff --git a/src/tasknet/utils.py b/src/tasknet/utils.py
index db7b4da..7de6337 100755
--- a/src/tasknet/utils.py
+++ b/src/tasknet/utils.py
@@ -1,5 +1,7 @@
 from datasets import DatasetDict, Dataset, load_dataset
 from easydict import EasyDict as edict
+import copy
+import functools
 
 
 def train_validation_test_split(dataset, train_ratio=0.8, val_test_ratio=0.5, seed=0):
@@ -23,4 +25,44 @@ def to_dict(x):
         return edict(x)
     else:
         x = edict({a: getattr(x, a) for a in dir(x) if not a.startswith('__')})
-    return x
\ No newline at end of file
+    return x
+
+def deep_copy_cache(function):
+    # Cache results by call arguments (positional and keyword); repeated
+    # calls return a deep copy, so each caller gets an independent object.
+    memo = {}
+    def wrapper(*args, **kwargs):
+        key = (args, tuple(sorted(kwargs.items())))
+        if key in memo:
+            return copy.deepcopy(memo[key])
+        else:
+            rv = function(*args, **kwargs)
+            memo[key] = rv
+            return rv
+    return wrapper
+
+def shallow_copy_A_to_B(A, B):
+    """Shallow copy (=parameter sharing) A into B
+    https://stackoverflow.com/questions/31174295/getattr-and-setattr-on-nested-objects/31174427?noredirect=1#comment86638618_31174427"""
+
+    def rsetattr(obj, attr, val):
+        pre, _, post = attr.rpartition(".")
+        return setattr(rgetattr(obj, pre) if pre else obj, post, val)
+
+    def rgetattr(obj, attr, *args):
+        def _getattr(obj, attr):
+            return getattr(obj, attr, *args)
+
+        return functools.reduce(_getattr, [obj] + attr.split("."))
+
+    for (na, _), (nb, _) in zip(A.named_parameters(), B.named_parameters()):
+        rsetattr(B, nb, rgetattr(A, na))
+    return A, B
+
+def normalize_label(label):
+    label = str(label).lower()
+    label = label.replace('-', '_')
+    label = label.replace(' ', '_')
+    label = label.replace('entailed', 'entailment')
+    label = label.replace('non_', 'not_')
+    label = label.replace('duplicate', 'equivalent')
+    # Exact matches only: a substring replace would corrupt labels that
+    # already contain "neg"/"pos" (e.g. "negative" -> "negativeative").
+    if label == 'neg': label = 'negative'
+    if label == 'pos': label = 'positive'
+    return label
\ No newline at end of file
diff --git a/src/tasks.md b/src/tasks.md
new file mode 100644
index 0000000..c1079ab
--- /dev/null
+++ b/src/tasks.md
@@ -0,0 +1,422 @@
+| id | dataset_name | config_name | task_name | preprocessing_name | task_type | mapping |
+|:----------------------------------------------------------------|:------------------------------------------------|:------------------------------------------------------|:--------------|:------------------------------------------------|:---------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| anli/r1 | anli | | r1 | anli__r1 | Classification | Classification(sentence1='premise', sentence2='hypothesis', labels='label', splits=['train_r1', 'dev_r1', 'test_r1'], dataset_name=None, config_name=None) |
+| anli/r2 | anli | | r2 | anli__r2 | Classification | Classification(sentence1='premise', sentence2='hypothesis', labels='label', splits=['train_r2', 'dev_r2', 'test_r2'], dataset_name=None, config_name=None) |
+| anli/r3 | anli | | r3 | anli__r3 | Classification | Classification(sentence1='premise', sentence2='hypothesis', labels='label', splits=['train_r3', 'dev_r3', 'test_r3'], dataset_name=None, config_name=None) |
+| sick/label | sick | | label | sick__label | Classification | Classification(sentence1='sentence_A', sentence2='sentence_B', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) |
+| sick/relatedness | sick | | relatedness | sick__relatedness | Classification | Classification(sentence1='sentence_A', sentence2='sentence_B', labels='relatedness_score', splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) |
+| sick/entailment_AB | sick | | entailment_AB | sick__entailment_AB | Classification | Classification(sentence1='sentence_A', sentence2='sentence_B', labels='entailment_AB', splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) |
+| sick/entailment_BA | sick | | entailment_BA | sick__entailment_BA | Classification | Classification(sentence1='sentence_A', sentence2='sentence_B', labels='entailment_BA', splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) |
+| snli | snli | | | snli | Classification | Classification(sentence1='premise', sentence2='hypothesis', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) |
+| scitail/snli_format | scitail | snli_format | | scitail | Classification | Classification(sentence1='sentence1', sentence2='sentence2', labels='gold_label', splits=('train', 'validation', 'test'), dataset_name=None, config_name='snli_format') |
+| hans | hans | | | hans | Classification | Classification(sentence1='premise', sentence2='hypothesis', labels='label', splits=['train', 'validation', None], dataset_name=None, config_name=None) |
+| WANLI | alisawuffles/WANLI | | | wanli | Classification | Classification(sentence1='premise', sentence2='hypothesis', labels='gold', splits=('train', 'validation', 'test'), dataset_name='alisawuffles/WANLI', config_name=None) |
+| recast/recast_megaveridicality | metaeval/recast | recast_megaveridicality | | recast | Classification | Classification(sentence1='context', sentence2='hypothesis', labels='label', splits=('train', 'validation', 'test'), dataset_name='metaeval/recast', config_name=['recast_kg_relations', 'recast_puns', 'recast_factuality', 'recast_verbnet', 'recast_verbcorner', 'recast_ner', 'recast_sentiment', 'recast_megaveridicality']) |
+| recast/recast_sentiment | metaeval/recast | recast_sentiment | | recast | Classification | Classification(sentence1='context', sentence2='hypothesis', labels='label', splits=('train', 'validation', 'test'), dataset_name='metaeval/recast', config_name=['recast_kg_relations', 'recast_puns', 'recast_factuality', 'recast_verbnet', 'recast_verbcorner', 'recast_ner', 'recast_sentiment', 'recast_megaveridicality']) |
+| recast/recast_ner | metaeval/recast | recast_ner | | recast | Classification | Classification(sentence1='context', sentence2='hypothesis', labels='label', splits=('train', 'validation', 'test'), dataset_name='metaeval/recast', config_name=['recast_kg_relations', 'recast_puns', 'recast_factuality', 'recast_verbnet', 'recast_verbcorner', 'recast_ner', 'recast_sentiment', 'recast_megaveridicality']) |
+| recast/recast_verbcorner | metaeval/recast | recast_verbcorner | | recast | Classification | Classification(sentence1='context', sentence2='hypothesis', labels='label', splits=('train', 'validation', 'test'), dataset_name='metaeval/recast', config_name=['recast_kg_relations', 'recast_puns', 'recast_factuality', 'recast_verbnet', 'recast_verbcorner', 'recast_ner', 'recast_sentiment', 'recast_megaveridicality']) |
+| recast/recast_verbnet | metaeval/recast | recast_verbnet | | recast | Classification | Classification(sentence1='context', sentence2='hypothesis', labels='label', splits=('train', 'validation', 'test'), dataset_name='metaeval/recast', config_name=['recast_kg_relations', 'recast_puns', 'recast_factuality', 'recast_verbnet', 'recast_verbcorner', 'recast_ner', 'recast_sentiment', 'recast_megaveridicality']) |
+| recast/recast_factuality | metaeval/recast | recast_factuality | | recast | Classification | Classification(sentence1='context', sentence2='hypothesis', labels='label', splits=('train', 'validation', 'test'), dataset_name='metaeval/recast', config_name=['recast_kg_relations', 'recast_puns', 'recast_factuality', 'recast_verbnet', 'recast_verbcorner', 'recast_ner', 'recast_sentiment', 'recast_megaveridicality']) |
+| recast/recast_puns | metaeval/recast | recast_puns | | recast | Classification | Classification(sentence1='context', sentence2='hypothesis', labels='label', splits=('train', 'validation', 'test'), dataset_name='metaeval/recast', config_name=['recast_kg_relations', 'recast_puns', 'recast_factuality', 'recast_verbnet', 'recast_verbcorner', 'recast_ner', 'recast_sentiment', 'recast_megaveridicality']) |
+| recast/recast_kg_relations | metaeval/recast | recast_kg_relations | | recast | Classification | Classification(sentence1='context', sentence2='hypothesis', labels='label', splits=('train', 'validation', 'test'), dataset_name='metaeval/recast', config_name=['recast_kg_relations', 'recast_puns', 'recast_factuality', 'recast_verbnet', 'recast_verbcorner', 'recast_ner', 'recast_sentiment', 'recast_megaveridicality']) |
+| probability_words_nli/reasoning_2hop | sileod/probability_words_nli | reasoning_2hop | | probability_words_nli | Classification | Classification(sentence1='context', sentence2='hypothesis', labels='label', splits=('train', 'validation', 'test'), dataset_name='sileod/probability_words_nli', config_name=['reasoning_1hop', 'reasoning_2hop', 'usnli']) |
+| probability_words_nli/usnli | sileod/probability_words_nli | usnli | | probability_words_nli | Classification | Classification(sentence1='context', sentence2='hypothesis', labels='label', splits=('train', 'validation', 'test'), dataset_name='sileod/probability_words_nli', config_name=['reasoning_1hop', 'reasoning_2hop', 'usnli']) |
+| probability_words_nli/reasoning_1hop | sileod/probability_words_nli | reasoning_1hop | | probability_words_nli | Classification | Classification(sentence1='context', sentence2='hypothesis', labels='label', splits=('train', 'validation', 'test'), dataset_name='sileod/probability_words_nli', config_name=['reasoning_1hop', 'reasoning_2hop', 'usnli']) |
+| nan-nli/joey234--nan-nli | joey234/nan-nli | joey234--nan-nli | | nan_nli | Classification | Classification(sentence1='premise', sentence2='hypothesis', labels='label', splits=('train', 'validation', 'test'), dataset_name='joey234/nan-nli', config_name='joey234--nan-nli') |
+| gen_debiased_nli/snli_seq_z | pietrolesci/gen_debiased_nli | | snli_seq_z | gen_debiased_nli__snli_seq_z | Classification | Classification(sentence1='premise', sentence2='hypothesis', labels='label', splits=['snli_seq_z', None, None], dataset_name='pietrolesci/gen_debiased_nli', config_name=None) |
+| gen_debiased_nli/snli_z_aug | pietrolesci/gen_debiased_nli | | snli_z_aug | gen_debiased_nli__snli_z_aug | Classification | Classification(sentence1='premise', sentence2='hypothesis', labels='label', splits=['snli_z_aug', None, None], dataset_name='pietrolesci/gen_debiased_nli', config_name=None) |
+| gen_debiased_nli/snli_par_z | pietrolesci/gen_debiased_nli | | snli_par_z | gen_debiased_nli__snli_par_z | Classification | Classification(sentence1='premise', sentence2='hypothesis', labels='label', splits=['snli_par_z', None, None], dataset_name='pietrolesci/gen_debiased_nli', config_name=None) |
+| gen_debiased_nli/mnli_par_z | pietrolesci/gen_debiased_nli | | mnli_par_z | gen_debiased_nli__mnli_par_z | Classification | Classification(sentence1='premise', sentence2='hypothesis', labels='label', splits=['mnli_par_z', None, None], dataset_name='pietrolesci/gen_debiased_nli', config_name=None) |
+| gen_debiased_nli/mnli_z_aug | pietrolesci/gen_debiased_nli | | mnli_z_aug | gen_debiased_nli__mnli_z_aug | Classification | Classification(sentence1='premise', sentence2='hypothesis', labels='label', splits=['mnli_z_aug', None, None], dataset_name='pietrolesci/gen_debiased_nli', config_name=None) |
+| gen_debiased_nli/mnli_seq_z | pietrolesci/gen_debiased_nli | | mnli_seq_z | gen_debiased_nli__mnli_seq_z | Classification | Classification(sentence1='premise', sentence2='hypothesis', labels='label', splits=['mnli_seq_z', None, None], dataset_name='pietrolesci/gen_debiased_nli', config_name=None) |
+| glue_diagnostics/diagnostics | pietrolesci/glue_diagnostics | | diagnostics | glue__diagnostics | Classification | Classification(sentence1='premise', sentence2='hypothesis', labels='label', splits=['test', None, None], dataset_name='pietrolesci/glue_diagnostics', config_name=None) |
+| breaking_nli | pietrolesci/breaking_nli | | | breaking_nli | Classification | Classification(sentence1='sentence1', sentence2='sentence2', labels='label', splits=['full', None, None], dataset_name='pietrolesci/breaking_nli', config_name=None) |
+| conj_nli | pietrolesci/conj_nli | | | conj_nli | Classification | Classification(sentence1='premise', sentence2='hypothesis', labels='label', splits=('train', 'validation', 'test'), dataset_name='pietrolesci/conj_nli', config_name=None) |
+| fracas | pietrolesci/fracas | | | fracas | Classification | Classification(sentence1='premise', sentence2='hypothesis', labels='label', splits=('train', 'validation', 'test'), dataset_name='pietrolesci/fracas', config_name=None) |
+| dialogue_nli | pietrolesci/dialogue_nli | | | dialogue_nli | Classification | Classification(sentence1='sentence1', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name='pietrolesci/dialogue_nli', config_name=None) |
+| dnc | pietrolesci/dnc | | | dnc_nli | Classification | Classification(sentence1='context', sentence2='hypothesis', labels='label', splits=('train', 'validation', 'test'), dataset_name='pietrolesci/dnc', config_name=None) |
+| gpt3_nli | pietrolesci/gpt3_nli | | | gpt3_nli | Classification | Classification(sentence1='text_a', sentence2='text_b', labels='label', splits=('train', 'validation', 'test'), dataset_name='pietrolesci/gpt3_nli', config_name=None) |
+| recast_white/fnplus | pietrolesci/recast_white | | fnplus | recast_white__fnplus | Classification | Classification(sentence1='text', sentence2='hypothesis', labels='label', splits=['fnplus', None, None], dataset_name='pietrolesci/recast_white', config_name=None) |
+| recast_white/sprl | pietrolesci/recast_white | | sprl | recast_white__sprl | Classification | Classification(sentence1='text', sentence2='hypothesis', labels='label', splits=['sprl', None, None], dataset_name='pietrolesci/recast_white', config_name=None) |
+| recast_white/dpr | pietrolesci/recast_white | | dpr | recast_white__dpr | Classification | Classification(sentence1='text', sentence2='hypothesis', labels='label', splits=['dpr', None, None], dataset_name='pietrolesci/recast_white', config_name=None) |
+| joci | pietrolesci/joci | | | joci | Classification | Classification(sentence1='context', sentence2='hypothesis', labels='label', splits=['full', None, None], dataset_name='pietrolesci/joci', config_name=None) |
+| enfever_nli | ctu-aic/enfever_nli | | | enfever_nli | Classification | Classification(sentence1='evidence', sentence2='claim', labels='label', splits=('train', 'validation', 'test'), dataset_name='ctu-aic/enfever_nli', config_name=None) |
+| contrast_nli | martn-nguyen/contrast_nli | | | contrast_nli | Classification | Classification(sentence1='premise', sentence2='hypothesis', labels='label', splits=('train', 'validation', 'test'), dataset_name='martn-nguyen/contrast_nli', config_name=None) |
+| robust_nli/IS_CS | pietrolesci/robust_nli | | IS_CS | robust_nli__IS_CS | Classification | Classification(sentence1='premise', sentence2='hypothesis', labels='label', splits=['IS_CS', None, None], dataset_name='pietrolesci/robust_nli', config_name=None) |
+| robust_nli/LI_LI | pietrolesci/robust_nli | | LI_LI | robust_nli__LI_LI | Classification | Classification(sentence1='premise', sentence2='hypothesis', labels='label', splits=['LI_LI', None, None], dataset_name='pietrolesci/robust_nli', config_name=None) |
+| robust_nli/ST_WO | pietrolesci/robust_nli | | ST_WO | robust_nli__ST_WO | Classification | Classification(sentence1='premise', sentence2='hypothesis', labels='label', splits=['ST_WO', None, None], dataset_name='pietrolesci/robust_nli', config_name=None) |
+| robust_nli/PI_SP | pietrolesci/robust_nli | | PI_SP | robust_nli__PI_SP | Classification | Classification(sentence1='premise', sentence2='hypothesis', labels='label', splits=['PI_SP', None, None], dataset_name='pietrolesci/robust_nli', config_name=None) |
+| robust_nli/PI_CD | pietrolesci/robust_nli | | PI_CD | robust_nli__PI_CD | Classification | Classification(sentence1='premise', sentence2='hypothesis', labels='label', splits=['PI_CD', None, None], dataset_name='pietrolesci/robust_nli', config_name=None) |
+| robust_nli/ST_SE | pietrolesci/robust_nli | | ST_SE | robust_nli__ST_SE | Classification | Classification(sentence1='premise', sentence2='hypothesis', labels='label', splits=['ST_SE', None, None], dataset_name='pietrolesci/robust_nli', config_name=None) |
+| robust_nli/ST_NE | pietrolesci/robust_nli | | ST_NE | robust_nli__ST_NE | Classification | Classification(sentence1='premise', sentence2='hypothesis', labels='label', splits=['ST_NE', None, None], dataset_name='pietrolesci/robust_nli', config_name=None) |
+| robust_nli/ST_LM | pietrolesci/robust_nli | | ST_LM | robust_nli__ST_LM | Classification | Classification(sentence1='premise', sentence2='hypothesis', labels='label', splits=['ST_LM', None, None], dataset_name='pietrolesci/robust_nli', config_name=None) |
+| robust_nli_is_sd | pietrolesci/robust_nli_is_sd | | | robust_nli_is_sd | Classification | Classification(sentence1='premise', sentence2='hypothesis', labels='label', splits=('train', 'validation', 'test'), dataset_name='pietrolesci/robust_nli_is_sd', config_name=None) |
+| robust_nli_li_ts | pietrolesci/robust_nli_li_ts | | | robust_nli_li_ts | Classification | Classification(sentence1='premise', sentence2='hypothesis', labels='label', splits=('train', 'validation', 'test'), dataset_name='pietrolesci/robust_nli_li_ts', config_name=None) |
+| add_one_rte | pietrolesci/add_one_rte | | | add_one_rte | Classification | Classification(sentence1='premise', sentence2='hypothesis', labels='label', splits=['train', 'dev', 'test'], dataset_name='pietrolesci/add_one_rte', config_name=None) |
+| hlgd | hlgd | | | hlgd | Classification | Classification(sentence1='headline_a', sentence2='headline_b', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) |
+| paws/labeled_final | paws | labeled_final | | paws___labeled_final | Classification | Classification(sentence1='sentence1', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) |
+| paws/labeled_swap | paws | labeled_swap | | paws___labeled_swap | Classification | Classification(sentence1='sentence1', sentence2='sentence2', labels='label', splits=['train', None, None], dataset_name=None, config_name=None) |
+| paws/unlabeled_final | paws | unlabeled_final | | paws___unlabeled_final | Classification | Classification(sentence1='sentence1', sentence2='sentence2', labels='label', splits=['train', 'validation', None], dataset_name=None, config_name=None) |
+| quora | quora | | | quora | Classification | Classification(sentence1=, sentence2=, labels='is_duplicate', splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) |
+| medical_questions_pairs | medical_questions_pairs | | | medical_questions_pairs | Classification | Classification(sentence1='question_1', sentence2='question_2', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) |
+| conll2003/pos_tags | conll2003 | | pos_tags | conll2003__pos_tags | TokenClassification | TokenClassification(tokens='tokens', labels='pos_tags', splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) |
+| conll2003/chunk_tags | conll2003 | | chunk_tags | conll2003__chunk_tags | TokenClassification | TokenClassification(tokens='tokens', labels='chunk_tags', splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) |
+| conll2003/ner_tags | conll2003 | | ner_tags | conll2003__ner_tags | TokenClassification | TokenClassification(tokens='tokens', labels='ner_tags', splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) |
+| hh-rlhf | Anthropic/hh-rlhf | | | anthropic_rlhf | MultipleChoice | MultipleChoice(inputs=constantly(), choices=['chosen', 'rejected'], labels=constantly(0), choices_list=None, splits=('train', 'validation', 'test'), dataset_name='Anthropic/hh-rlhf', config_name=None) |
+| model-written-evals | Anthropic/model-written-evals | | | model_written_evals | MultipleChoice | MultipleChoice(inputs='question', choices=['answer_matching_behavior', 'answer_not_matching_behavior'], labels=constantly(0), choices_list=None, splits=('train', 'validation', 'test'), dataset_name='Anthropic/model-written-evals', config_name=None) |
+| bigbench/ruin_names | bigbench | ruin_names | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 'international_phonetic_alphabet_nli'}) |
+| bigbench/geometric_shapes | bigbench | geometric_shapes | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 'international_phonetic_alphabet_nli'}) |
+| bigbench/logical_sequence | bigbench | logical_sequence | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 'international_phonetic_alphabet_nli'}) |
+| bigbench/logic_grid_puzzle | bigbench | logic_grid_puzzle | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 'international_phonetic_alphabet_nli'}) |
+| bigbench/nonsense_words_grammar | bigbench | nonsense_words_grammar | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 'international_phonetic_alphabet_nli'}) |
+| bigbench/undo_permutation | bigbench | undo_permutation | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 'international_phonetic_alphabet_nli'}) |
+| bigbench/code_line_description | bigbench | code_line_description | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 'international_phonetic_alphabet_nli'}) |
+| bigbench/analytic_entailment | bigbench | analytic_entailment | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 'international_phonetic_alphabet_nli'}) |
+| bigbench/logical_deduction | bigbench | logical_deduction | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 'international_phonetic_alphabet_nli'}) |
+| bigbench/identify_odd_metaphor | bigbench | identify_odd_metaphor | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 'international_phonetic_alphabet_nli'}) |
+| bigbench/logical_args | bigbench | logical_args | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 'international_phonetic_alphabet_nli'}) |
+| bigbench/unit_interpretation | bigbench | unit_interpretation | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 'international_phonetic_alphabet_nli'}) |
+| bigbench/snarks | bigbench | snarks | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 'international_phonetic_alphabet_nli'}) |
+| bigbench/international_phonetic_alphabet_nli | bigbench | international_phonetic_alphabet_nli | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 'international_phonetic_alphabet_nli'}) |
+| bigbench/suicide_risk | bigbench | suicide_risk | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts',
'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 'international_phonetic_alphabet_nli'}) | +| bigbench/intersect_geometry | bigbench | intersect_geometry | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels= at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 
'human_organs_senses', 'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 'international_phonetic_alphabet_nli'}) | +| bigbench/cs_algorithms | bigbench | cs_algorithms | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels= at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 'international_phonetic_alphabet_nli'}) | +| bigbench/fantasy_reasoning | bigbench | fantasy_reasoning | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels= at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 
'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 'international_phonetic_alphabet_nli'}) | +| bigbench/riddle_sense | bigbench | riddle_sense | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels= at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 
'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 'international_phonetic_alphabet_nli'}) | +| bigbench/vitaminc_fact_verification | bigbench | vitaminc_fact_verification | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels= at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 
'key_value_maps', 'disambiguation_qa', 'international_phonetic_alphabet_nli'}) | +| bigbench/metaphor_boolean | bigbench | metaphor_boolean | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels= at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 'international_phonetic_alphabet_nli'}) | +| bigbench/formal_fallacies_syllogisms_negation | bigbench | formal_fallacies_syllogisms_negation | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels= at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 
'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 'international_phonetic_alphabet_nli'}) | +| bigbench/implicatures | bigbench | implicatures | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels= at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 
'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 'international_phonetic_alphabet_nli'}) | +| bigbench/key_value_maps | bigbench | key_value_maps | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels= at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 'international_phonetic_alphabet_nli'}) | +| bigbench/disambiguation_qa | bigbench | disambiguation_qa | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels= at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 
'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 'international_phonetic_alphabet_nli'}) | +| bigbench/human_organs_senses | bigbench | human_organs_senses | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels= at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 
'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 'international_phonetic_alphabet_nli'}) | +| bigbench/timedial | bigbench | timedial | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels= at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 
'international_phonetic_alphabet_nli'}) | +| bigbench/analogical_similarity | bigbench | analogical_similarity | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels= at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 'international_phonetic_alphabet_nli'}) | +| bigbench/temporal_sequences | bigbench | temporal_sequences | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels= at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 
'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 'international_phonetic_alphabet_nli'}) | +| bigbench/salient_translation_error_detection | bigbench | salient_translation_error_detection | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels= at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 'ruin_names', 'geometric_shapes', 
'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 'international_phonetic_alphabet_nli'}) | +| bigbench/strategyqa | bigbench | strategyqa | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels= at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 'international_phonetic_alphabet_nli'}) | +| bigbench/emoji_movie | bigbench | emoji_movie | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels= at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 
'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 'international_phonetic_alphabet_nli'}) | +| bigbench/play_dialog_same_or_different | bigbench | play_dialog_same_or_different | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels= at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 'hyperbaton', 'mnist_ascii', 
'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 'international_phonetic_alphabet_nli'}) | +| bigbench/simple_ethical_questions | bigbench | simple_ethical_questions | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels= at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 
'international_phonetic_alphabet_nli'}) |
+| bigbench/novel_concepts | bigbench | novel_concepts | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 'international_phonetic_alphabet_nli'}) |
+| bigbench/contextual_parametric_knowledge_conflicts | bigbench | contextual_parametric_knowledge_conflicts | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={...}) |
+| bigbench/intent_recognition | bigbench | intent_recognition | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={...}) |
+| bigbench/mathematical_induction | bigbench | mathematical_induction | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={...}) |
+| bigbench/hindu_knowledge | bigbench | hindu_knowledge | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={...}) |
+| bigbench/gre_reading_comprehension | bigbench | gre_reading_comprehension | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={...}) |
+| bigbench/symbol_interpretation | bigbench | symbol_interpretation | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={...}) |
+| bigbench/figure_of_speech_detection | bigbench | figure_of_speech_detection | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={...}) |
+| bigbench/causal_judgment | bigbench | causal_judgment | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={...}) |
+| bigbench/winowhy | bigbench | winowhy | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={...}) |
+| bigbench/cifar10_classification | bigbench | cifar10_classification | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={...}) |
+| bigbench/strange_stories | bigbench | strange_stories | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={...}) |
+| bigbench/moral_permissibility | bigbench | moral_permissibility | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={...}) |
+| bigbench/evaluating_information_essentiality | bigbench | evaluating_information_essentiality | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={...}) |
+| bigbench/authorship_verification | bigbench | authorship_verification | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={...}) |
+| bigbench/checkmate_in_one | bigbench | checkmate_in_one | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={...}) |
+| bigbench/known_unknowns | bigbench | known_unknowns | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={...}) |
+| bigbench/irony_identification | bigbench | irony_identification | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={...}) |
+| bigbench/date_understanding | bigbench | date_understanding | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={...}) |
+| bigbench/misconceptions | bigbench | misconceptions | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={...}) |
+| bigbench/color | bigbench | color | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={...}) |
+| bigbench/identify_math_theorems | bigbench | identify_math_theorems | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={...}) |
+| bigbench/empirical_judgments | bigbench | empirical_judgments | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={...}) |
+| bigbench/bbq_lite_json | bigbench | bbq_lite_json | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={...}) |
+| bigbench/arithmetic | bigbench | arithmetic | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={...}) |
+| bigbench/hhh_alignment | bigbench | hhh_alignment | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={...}) |
+| bigbench/entailed_polarity | bigbench | entailed_polarity | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={...}) |
+| bigbench/abstract_narrative_understanding | bigbench | abstract_narrative_understanding | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={...}) |
+| bigbench/discourse_marker_prediction | bigbench | 
discourse_marker_prediction | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels= at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 'international_phonetic_alphabet_nli'}) | +| bigbench/conceptual_combinations | bigbench | conceptual_combinations | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels= at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 
'symbol_interpretation', 'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 'international_phonetic_alphabet_nli'}) | +| bigbench/crass_ai | bigbench | crass_ai | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels= at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 
'code_line_description', 'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 'international_phonetic_alphabet_nli'}) | +| bigbench/sports_understanding | bigbench | sports_understanding | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels= at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 'international_phonetic_alphabet_nli'}) | +| bigbench/understanding_fables | bigbench | understanding_fables | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels= at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 
'sports_understanding', 'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 'international_phonetic_alphabet_nli'}) | +| bigbench/goal_step_wikihow | bigbench | goal_step_wikihow | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels= at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 
'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 'international_phonetic_alphabet_nli'}) | +| bigbench/english_proverbs | bigbench | english_proverbs | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels= at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 'international_phonetic_alphabet_nli'}) | +| bigbench/question_selection | bigbench | question_selection | | bigbench | 
MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels= at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 'international_phonetic_alphabet_nli'}) | +| bigbench/emojis_emotion_prediction | bigbench | emojis_emotion_prediction | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels= at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 'figure_of_speech_detection', 
'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 'international_phonetic_alphabet_nli'}) | +| bigbench/logical_fallacy_detection | bigbench | logical_fallacy_detection | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels= at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 
'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 'international_phonetic_alphabet_nli'}) | +| bigbench/physical_intuition | bigbench | physical_intuition | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels= at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 'international_phonetic_alphabet_nli'}) | +| bigbench/sentence_ambiguity | bigbench | sentence_ambiguity | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels= at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 
'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 'international_phonetic_alphabet_nli'}) | +| bigbench/tracking_shuffled_objects | bigbench | tracking_shuffled_objects | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels= at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 
'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 'international_phonetic_alphabet_nli'}) | +| bigbench/crash_blossom | bigbench | crash_blossom | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels= at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 'international_phonetic_alphabet_nli'}) | +| bigbench/movie_dialog_same_or_different | bigbench | 
movie_dialog_same_or_different | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels= at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 'international_phonetic_alphabet_nli'}) | +| bigbench/navigate | bigbench | navigate | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels= at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 
'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 'international_phonetic_alphabet_nli'}) | +| bigbench/fact_checker | bigbench | fact_checker | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels= at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 
'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 'international_phonetic_alphabet_nli'}) | +| bigbench/general_knowledge | bigbench | general_knowledge | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels= at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 'international_phonetic_alphabet_nli'}) | +| bigbench/presuppositions_as_nli | bigbench | presuppositions_as_nli | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels= at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={'date_understanding', 'misconceptions', 'color', 'identify_math_theorems', 'empirical_judgments', 'bbq_lite_json', 'arithmetic', 'hhh_alignment', 'entailed_polarity', 'abstract_narrative_understanding', 'discourse_marker_prediction', 'conceptual_combinations', 'crass_ai', 'sports_understanding', 
'understanding_fables', 'goal_step_wikihow', 'english_proverbs', 'question_selection', 'emojis_emotion_prediction', 'irony_identification', 'known_unknowns', 'checkmate_in_one', 'hindu_knowledge', 'strategyqa', 'emoji_movie', 'play_dialog_same_or_different', 'simple_ethical_questions', 'novel_concepts', 'contextual_parametric_knowledge_conflicts', 'intent_recognition', 'mathematical_induction', 'gre_reading_comprehension', 'authorship_verification', 'symbol_interpretation', 'figure_of_speech_detection', 'causal_judgment', 'winowhy', 'cifar10_classification', 'strange_stories', 'moral_permissibility', 'evaluating_information_essentiality', 'logical_fallacy_detection', 'sentence_ambiguity', 'crash_blossom', 'similarities_abstraction', 'anachronisms', 'movie_dialog_same_or_different', 'navigate', 'fact_checker', 'general_knowledge', 'presuppositions_as_nli', 'hyperbaton', 'mnist_ascii', 'epistemic_reasoning', 'odd_one_out', 'elementary_math_qa', 'phrase_relatedness', 'dyck_languages', 'penguins_in_a_table', 'metaphor_understanding', 'social_iqa', 'movie_recommendation', 'physics', 'dark_humor_detection', 'implicit_relations', 'real_or_fake_text', 'reasoning_about_colored_objects', 'cause_and_effect', 'social_support', 'salient_translation_error_detection', 'physical_intuition', 'analogical_similarity', 'human_organs_senses', 'ruin_names', 'geometric_shapes', 'logical_sequence', 'logic_grid_puzzle', 'nonsense_words_grammar', 'tracking_shuffled_objects', 'undo_permutation', 'code_line_description', 'analytic_entailment', 'logical_deduction', 'identify_odd_metaphor', 'logical_args', 'unit_interpretation', 'snarks', 'timedial', 'suicide_risk', 'intersect_geometry', 'cs_algorithms', 'fantasy_reasoning', 'riddle_sense', 'vitaminc_fact_verification', 'metaphor_boolean', 'temporal_sequences', 'formal_fallacies_syllogisms_negation', 'implicatures', 'key_value_maps', 'disambiguation_qa', 'international_phonetic_alphabet_nli'}) |
+| bigbench/hyperbaton | bigbench | hyperbaton | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={...same config set as above...}) |
+| bigbench/mnist_ascii | bigbench | mnist_ascii | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={...same config set as above...}) |
+| bigbench/epistemic_reasoning | bigbench | epistemic_reasoning | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={...same config set as above...}) |
+| bigbench/odd_one_out | bigbench | odd_one_out | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={...same config set as above...}) |
+| bigbench/elementary_math_qa | bigbench | elementary_math_qa | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={...same config set as above...}) |
+| bigbench/similarities_abstraction | bigbench | similarities_abstraction | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={...same config set as above...}) |
+| bigbench/phrase_relatedness | bigbench | phrase_relatedness | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={...same config set as above...}) |
+| bigbench/penguins_in_a_table | bigbench | penguins_in_a_table | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={...same config set as above...}) |
+| bigbench/metaphor_understanding | bigbench | metaphor_understanding | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={...same config set as above...}) |
+| bigbench/social_iqa | bigbench | social_iqa | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={...same config set as above...}) |
+| bigbench/movie_recommendation | bigbench | movie_recommendation | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={...same config set as above...}) |
+| bigbench/physics | bigbench | physics | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={...same config set as above...}) |
+| bigbench/dark_humor_detection | bigbench | dark_humor_detection | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={...same config set as above...}) |
+| bigbench/implicit_relations | bigbench | implicit_relations | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={...same config set as above...}) |
+| bigbench/real_or_fake_text | bigbench | real_or_fake_text | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={...same config set as above...}) |
+| bigbench/reasoning_about_colored_objects | bigbench | reasoning_about_colored_objects | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={...same config set as above...}) |
+| bigbench/cause_and_effect | bigbench | cause_and_effect | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={...same config set as above...}) |
+| bigbench/social_support | bigbench | social_support | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={...same config set as above...}) |
+| bigbench/dyck_languages | bigbench | dyck_languages | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={...same config set as above...}) |
+| bigbench/anachronisms | bigbench | anachronisms | | bigbench | MultipleChoice | MultipleChoice(inputs='inputs', choices=(), labels=<function <lambda> at 0x7f5af07553a0>, choices_list='multiple_choice_targets', splits=('train', 'validation', 'test'), dataset_name=None, config_name={...same config set as above...}) |
+| blimp/left_branch_island_echo_question/hard | blimp | left_branch_island_echo_question | hard | blimp__hard | MultipleChoice | MultipleChoice(inputs=constantly(), choices=['sentence_good', 'sentence_bad'], labels=constantly(0), choices_list=None, splits=('train', 'validation', 'test'), dataset_name=None, config_name={'wh_vs_that_with_gap', 'principle_A_c_command', 'principle_A_reconstruction', 'npi_present_1', 'superlative_quantifiers_1', 'animate_subject_passive', 'inchoative', 'npi_present_2', 'tough_vs_raising_1', 'sentential_subject_island', 'drop_argument', 'matrix_question_npi_licensor_present', 'wh_vs_that_with_gap_long_distance', 'principle_A_domain_2', 'complex_NP_island', 'left_branch_island_echo_question', 'wh_questions_subject_gap_long_distance', 'coordinate_structure_constraint_object_extraction', 'wh_questions_object_gap', 'coordinate_structure_constraint_complex_left_branch', 'existential_there_quantifiers_2', 'sentential_negation_npi_scope'}) |
'left_branch_island_echo_question', 'wh_questions_subject_gap_long_distance', 'coordinate_structure_constraint_object_extraction', 'wh_questions_object_gap', 'coordinate_structure_constraint_complex_left_branch', 'existential_there_quantifiers_2', 'sentential_negation_npi_scope'}) | +| blimp/wh_questions_subject_gap_long_distance/hard | blimp | wh_questions_subject_gap_long_distance | hard | blimp__hard | MultipleChoice | MultipleChoice(inputs=constantly(), choices=['sentence_good', 'sentence_bad'], labels=constantly(0), choices_list=None, splits=('train', 'validation', 'test'), dataset_name=None, config_name={'wh_vs_that_with_gap', 'principle_A_c_command', 'principle_A_reconstruction', 'npi_present_1', 'superlative_quantifiers_1', 'animate_subject_passive', 'inchoative', 'npi_present_2', 'tough_vs_raising_1', 'sentential_subject_island', 'drop_argument', 'matrix_question_npi_licensor_present', 'wh_vs_that_with_gap_long_distance', 'principle_A_domain_2', 'complex_NP_island', 'left_branch_island_echo_question', 'wh_questions_subject_gap_long_distance', 'coordinate_structure_constraint_object_extraction', 'wh_questions_object_gap', 'coordinate_structure_constraint_complex_left_branch', 'existential_there_quantifiers_2', 'sentential_negation_npi_scope'}) | +| blimp/principle_A_domain_2/hard | blimp | principle_A_domain_2 | hard | blimp__hard | MultipleChoice | MultipleChoice(inputs=constantly(), choices=['sentence_good', 'sentence_bad'], labels=constantly(0), choices_list=None, splits=('train', 'validation', 'test'), dataset_name=None, config_name={'wh_vs_that_with_gap', 'principle_A_c_command', 'principle_A_reconstruction', 'npi_present_1', 'superlative_quantifiers_1', 'animate_subject_passive', 'inchoative', 'npi_present_2', 'tough_vs_raising_1', 'sentential_subject_island', 'drop_argument', 'matrix_question_npi_licensor_present', 'wh_vs_that_with_gap_long_distance', 'principle_A_domain_2', 'complex_NP_island', 'left_branch_island_echo_question', 'wh_questions_subject_gap_long_distance', 'coordinate_structure_constraint_object_extraction', 'wh_questions_object_gap', 'coordinate_structure_constraint_complex_left_branch', 'existential_there_quantifiers_2', 'sentential_negation_npi_scope'}) | +| blimp/coordinate_structure_constraint_complex_left_branch/hard | blimp | coordinate_structure_constraint_complex_left_branch | hard | blimp__hard | MultipleChoice | MultipleChoice(inputs=constantly(), choices=['sentence_good', 'sentence_bad'], labels=constantly(0), choices_list=None, splits=('train', 'validation', 'test'), dataset_name=None, config_name={'wh_vs_that_with_gap', 'principle_A_c_command', 'principle_A_reconstruction', 'npi_present_1', 'superlative_quantifiers_1', 'animate_subject_passive', 'inchoative', 'npi_present_2', 'tough_vs_raising_1', 'sentential_subject_island', 'drop_argument', 'matrix_question_npi_licensor_present', 'wh_vs_that_with_gap_long_distance', 'principle_A_domain_2', 'complex_NP_island', 'left_branch_island_echo_question', 'wh_questions_subject_gap_long_distance', 'coordinate_structure_constraint_object_extraction', 'wh_questions_object_gap', 'coordinate_structure_constraint_complex_left_branch', 'existential_there_quantifiers_2', 'sentential_negation_npi_scope'}) | +| blimp/complex_NP_island/hard | blimp | complex_NP_island | hard | blimp__hard | MultipleChoice | MultipleChoice(inputs=constantly(), choices=['sentence_good', 'sentence_bad'], labels=constantly(0), choices_list=None, splits=('train', 'validation', 'test'), dataset_name=None, 
config_name={'wh_vs_that_with_gap', 'principle_A_c_command', 'principle_A_reconstruction', 'npi_present_1', 'superlative_quantifiers_1', 'animate_subject_passive', 'inchoative', 'npi_present_2', 'tough_vs_raising_1', 'sentential_subject_island', 'drop_argument', 'matrix_question_npi_licensor_present', 'wh_vs_that_with_gap_long_distance', 'principle_A_domain_2', 'complex_NP_island', 'left_branch_island_echo_question', 'wh_questions_subject_gap_long_distance', 'coordinate_structure_constraint_object_extraction', 'wh_questions_object_gap', 'coordinate_structure_constraint_complex_left_branch', 'existential_there_quantifiers_2', 'sentential_negation_npi_scope'}) | +| blimp/wh_vs_that_with_gap/hard | blimp | wh_vs_that_with_gap | hard | blimp__hard | MultipleChoice | MultipleChoice(inputs=constantly(), choices=['sentence_good', 'sentence_bad'], labels=constantly(0), choices_list=None, splits=('train', 'validation', 'test'), dataset_name=None, config_name={'wh_vs_that_with_gap', 'principle_A_c_command', 'principle_A_reconstruction', 'npi_present_1', 'superlative_quantifiers_1', 'animate_subject_passive', 'inchoative', 'npi_present_2', 'tough_vs_raising_1', 'sentential_subject_island', 'drop_argument', 'matrix_question_npi_licensor_present', 'wh_vs_that_with_gap_long_distance', 'principle_A_domain_2', 'complex_NP_island', 'left_branch_island_echo_question', 'wh_questions_subject_gap_long_distance', 'coordinate_structure_constraint_object_extraction', 'wh_questions_object_gap', 'coordinate_structure_constraint_complex_left_branch', 'existential_there_quantifiers_2', 'sentential_negation_npi_scope'}) | +| blimp/wh_vs_that_with_gap_long_distance/hard | blimp | wh_vs_that_with_gap_long_distance | hard | blimp__hard | MultipleChoice | MultipleChoice(inputs=constantly(), choices=['sentence_good', 'sentence_bad'], labels=constantly(0), choices_list=None, splits=('train', 'validation', 'test'), dataset_name=None, config_name={'wh_vs_that_with_gap', 'principle_A_c_command', 'principle_A_reconstruction', 'npi_present_1', 'superlative_quantifiers_1', 'animate_subject_passive', 'inchoative', 'npi_present_2', 'tough_vs_raising_1', 'sentential_subject_island', 'drop_argument', 'matrix_question_npi_licensor_present', 'wh_vs_that_with_gap_long_distance', 'principle_A_domain_2', 'complex_NP_island', 'left_branch_island_echo_question', 'wh_questions_subject_gap_long_distance', 'coordinate_structure_constraint_object_extraction', 'wh_questions_object_gap', 'coordinate_structure_constraint_complex_left_branch', 'existential_there_quantifiers_2', 'sentential_negation_npi_scope'}) | +| blimp/matrix_question_npi_licensor_present/hard | blimp | matrix_question_npi_licensor_present | hard | blimp__hard | MultipleChoice | MultipleChoice(inputs=constantly(), choices=['sentence_good', 'sentence_bad'], labels=constantly(0), choices_list=None, splits=('train', 'validation', 'test'), dataset_name=None, config_name={'wh_vs_that_with_gap', 'principle_A_c_command', 'principle_A_reconstruction', 'npi_present_1', 'superlative_quantifiers_1', 'animate_subject_passive', 'inchoative', 'npi_present_2', 'tough_vs_raising_1', 'sentential_subject_island', 'drop_argument', 'matrix_question_npi_licensor_present', 'wh_vs_that_with_gap_long_distance', 'principle_A_domain_2', 'complex_NP_island', 'left_branch_island_echo_question', 'wh_questions_subject_gap_long_distance', 'coordinate_structure_constraint_object_extraction', 'wh_questions_object_gap', 'coordinate_structure_constraint_complex_left_branch', 'existential_there_quantifiers_2', 
'sentential_negation_npi_scope'}) | +| blimp/drop_argument/hard | blimp | drop_argument | hard | blimp__hard | MultipleChoice | MultipleChoice(inputs=constantly(), choices=['sentence_good', 'sentence_bad'], labels=constantly(0), choices_list=None, splits=('train', 'validation', 'test'), dataset_name=None, config_name={'wh_vs_that_with_gap', 'principle_A_c_command', 'principle_A_reconstruction', 'npi_present_1', 'superlative_quantifiers_1', 'animate_subject_passive', 'inchoative', 'npi_present_2', 'tough_vs_raising_1', 'sentential_subject_island', 'drop_argument', 'matrix_question_npi_licensor_present', 'wh_vs_that_with_gap_long_distance', 'principle_A_domain_2', 'complex_NP_island', 'left_branch_island_echo_question', 'wh_questions_subject_gap_long_distance', 'coordinate_structure_constraint_object_extraction', 'wh_questions_object_gap', 'coordinate_structure_constraint_complex_left_branch', 'existential_there_quantifiers_2', 'sentential_negation_npi_scope'}) | +| blimp/sentential_subject_island/hard | blimp | sentential_subject_island | hard | blimp__hard | MultipleChoice | MultipleChoice(inputs=constantly(), choices=['sentence_good', 'sentence_bad'], labels=constantly(0), choices_list=None, splits=('train', 'validation', 'test'), dataset_name=None, config_name={'wh_vs_that_with_gap', 'principle_A_c_command', 'principle_A_reconstruction', 'npi_present_1', 'superlative_quantifiers_1', 'animate_subject_passive', 'inchoative', 'npi_present_2', 'tough_vs_raising_1', 'sentential_subject_island', 'drop_argument', 'matrix_question_npi_licensor_present', 'wh_vs_that_with_gap_long_distance', 'principle_A_domain_2', 'complex_NP_island', 'left_branch_island_echo_question', 'wh_questions_subject_gap_long_distance', 'coordinate_structure_constraint_object_extraction', 'wh_questions_object_gap', 'coordinate_structure_constraint_complex_left_branch', 'existential_there_quantifiers_2', 'sentential_negation_npi_scope'}) | +| blimp/tough_vs_raising_1/hard | blimp | tough_vs_raising_1 | hard | blimp__hard | MultipleChoice | MultipleChoice(inputs=constantly(), choices=['sentence_good', 'sentence_bad'], labels=constantly(0), choices_list=None, splits=('train', 'validation', 'test'), dataset_name=None, config_name={'wh_vs_that_with_gap', 'principle_A_c_command', 'principle_A_reconstruction', 'npi_present_1', 'superlative_quantifiers_1', 'animate_subject_passive', 'inchoative', 'npi_present_2', 'tough_vs_raising_1', 'sentential_subject_island', 'drop_argument', 'matrix_question_npi_licensor_present', 'wh_vs_that_with_gap_long_distance', 'principle_A_domain_2', 'complex_NP_island', 'left_branch_island_echo_question', 'wh_questions_subject_gap_long_distance', 'coordinate_structure_constraint_object_extraction', 'wh_questions_object_gap', 'coordinate_structure_constraint_complex_left_branch', 'existential_there_quantifiers_2', 'sentential_negation_npi_scope'}) | +| blimp/existential_there_quantifiers_2/hard | blimp | existential_there_quantifiers_2 | hard | blimp__hard | MultipleChoice | MultipleChoice(inputs=constantly(), choices=['sentence_good', 'sentence_bad'], labels=constantly(0), choices_list=None, splits=('train', 'validation', 'test'), dataset_name=None, config_name={'wh_vs_that_with_gap', 'principle_A_c_command', 'principle_A_reconstruction', 'npi_present_1', 'superlative_quantifiers_1', 'animate_subject_passive', 'inchoative', 'npi_present_2', 'tough_vs_raising_1', 'sentential_subject_island', 'drop_argument', 'matrix_question_npi_licensor_present', 'wh_vs_that_with_gap_long_distance', 
'principle_A_domain_2', 'complex_NP_island', 'left_branch_island_echo_question', 'wh_questions_subject_gap_long_distance', 'coordinate_structure_constraint_object_extraction', 'wh_questions_object_gap', 'coordinate_structure_constraint_complex_left_branch', 'existential_there_quantifiers_2', 'sentential_negation_npi_scope'}) | +| blimp/npi_present_2/hard | blimp | npi_present_2 | hard | blimp__hard | MultipleChoice | MultipleChoice(inputs=constantly(), choices=['sentence_good', 'sentence_bad'], labels=constantly(0), choices_list=None, splits=('train', 'validation', 'test'), dataset_name=None, config_name={'wh_vs_that_with_gap', 'principle_A_c_command', 'principle_A_reconstruction', 'npi_present_1', 'superlative_quantifiers_1', 'animate_subject_passive', 'inchoative', 'npi_present_2', 'tough_vs_raising_1', 'sentential_subject_island', 'drop_argument', 'matrix_question_npi_licensor_present', 'wh_vs_that_with_gap_long_distance', 'principle_A_domain_2', 'complex_NP_island', 'left_branch_island_echo_question', 'wh_questions_subject_gap_long_distance', 'coordinate_structure_constraint_object_extraction', 'wh_questions_object_gap', 'coordinate_structure_constraint_complex_left_branch', 'existential_there_quantifiers_2', 'sentential_negation_npi_scope'}) | +| blimp/animate_subject_passive/hard | blimp | animate_subject_passive | hard | blimp__hard | MultipleChoice | MultipleChoice(inputs=constantly(), choices=['sentence_good', 'sentence_bad'], labels=constantly(0), choices_list=None, splits=('train', 'validation', 'test'), dataset_name=None, config_name={'wh_vs_that_with_gap', 'principle_A_c_command', 'principle_A_reconstruction', 'npi_present_1', 'superlative_quantifiers_1', 'animate_subject_passive', 'inchoative', 'npi_present_2', 'tough_vs_raising_1', 'sentential_subject_island', 'drop_argument', 'matrix_question_npi_licensor_present', 'wh_vs_that_with_gap_long_distance', 'principle_A_domain_2', 'complex_NP_island', 'left_branch_island_echo_question', 'wh_questions_subject_gap_long_distance', 'coordinate_structure_constraint_object_extraction', 'wh_questions_object_gap', 'coordinate_structure_constraint_complex_left_branch', 'existential_there_quantifiers_2', 'sentential_negation_npi_scope'}) | +| blimp/superlative_quantifiers_1/hard | blimp | superlative_quantifiers_1 | hard | blimp__hard | MultipleChoice | MultipleChoice(inputs=constantly(), choices=['sentence_good', 'sentence_bad'], labels=constantly(0), choices_list=None, splits=('train', 'validation', 'test'), dataset_name=None, config_name={'wh_vs_that_with_gap', 'principle_A_c_command', 'principle_A_reconstruction', 'npi_present_1', 'superlative_quantifiers_1', 'animate_subject_passive', 'inchoative', 'npi_present_2', 'tough_vs_raising_1', 'sentential_subject_island', 'drop_argument', 'matrix_question_npi_licensor_present', 'wh_vs_that_with_gap_long_distance', 'principle_A_domain_2', 'complex_NP_island', 'left_branch_island_echo_question', 'wh_questions_subject_gap_long_distance', 'coordinate_structure_constraint_object_extraction', 'wh_questions_object_gap', 'coordinate_structure_constraint_complex_left_branch', 'existential_there_quantifiers_2', 'sentential_negation_npi_scope'}) | +| blimp/npi_present_1/hard | blimp | npi_present_1 | hard | blimp__hard | MultipleChoice | MultipleChoice(inputs=constantly(), choices=['sentence_good', 'sentence_bad'], labels=constantly(0), choices_list=None, splits=('train', 'validation', 'test'), dataset_name=None, config_name={'wh_vs_that_with_gap', 'principle_A_c_command', 'principle_A_reconstruction', 
'npi_present_1', 'superlative_quantifiers_1', 'animate_subject_passive', 'inchoative', 'npi_present_2', 'tough_vs_raising_1', 'sentential_subject_island', 'drop_argument', 'matrix_question_npi_licensor_present', 'wh_vs_that_with_gap_long_distance', 'principle_A_domain_2', 'complex_NP_island', 'left_branch_island_echo_question', 'wh_questions_subject_gap_long_distance', 'coordinate_structure_constraint_object_extraction', 'wh_questions_object_gap', 'coordinate_structure_constraint_complex_left_branch', 'existential_there_quantifiers_2', 'sentential_negation_npi_scope'}) | +| blimp/principle_A_reconstruction/hard | blimp | principle_A_reconstruction | hard | blimp__hard | MultipleChoice | MultipleChoice(inputs=constantly(), choices=['sentence_good', 'sentence_bad'], labels=constantly(0), choices_list=None, splits=('train', 'validation', 'test'), dataset_name=None, config_name={'wh_vs_that_with_gap', 'principle_A_c_command', 'principle_A_reconstruction', 'npi_present_1', 'superlative_quantifiers_1', 'animate_subject_passive', 'inchoative', 'npi_present_2', 'tough_vs_raising_1', 'sentential_subject_island', 'drop_argument', 'matrix_question_npi_licensor_present', 'wh_vs_that_with_gap_long_distance', 'principle_A_domain_2', 'complex_NP_island', 'left_branch_island_echo_question', 'wh_questions_subject_gap_long_distance', 'coordinate_structure_constraint_object_extraction', 'wh_questions_object_gap', 'coordinate_structure_constraint_complex_left_branch', 'existential_there_quantifiers_2', 'sentential_negation_npi_scope'}) | +| blimp/wh_questions_object_gap/hard | blimp | wh_questions_object_gap | hard | blimp__hard | MultipleChoice | MultipleChoice(inputs=constantly(), choices=['sentence_good', 'sentence_bad'], labels=constantly(0), choices_list=None, splits=('train', 'validation', 'test'), dataset_name=None, config_name={'wh_vs_that_with_gap', 'principle_A_c_command', 'principle_A_reconstruction', 'npi_present_1', 'superlative_quantifiers_1', 'animate_subject_passive', 'inchoative', 'npi_present_2', 'tough_vs_raising_1', 'sentential_subject_island', 'drop_argument', 'matrix_question_npi_licensor_present', 'wh_vs_that_with_gap_long_distance', 'principle_A_domain_2', 'complex_NP_island', 'left_branch_island_echo_question', 'wh_questions_subject_gap_long_distance', 'coordinate_structure_constraint_object_extraction', 'wh_questions_object_gap', 'coordinate_structure_constraint_complex_left_branch', 'existential_there_quantifiers_2', 'sentential_negation_npi_scope'}) | +| blimp/principle_A_c_command/hard | blimp | principle_A_c_command | hard | blimp__hard | MultipleChoice | MultipleChoice(inputs=constantly(), choices=['sentence_good', 'sentence_bad'], labels=constantly(0), choices_list=None, splits=('train', 'validation', 'test'), dataset_name=None, config_name={'wh_vs_that_with_gap', 'principle_A_c_command', 'principle_A_reconstruction', 'npi_present_1', 'superlative_quantifiers_1', 'animate_subject_passive', 'inchoative', 'npi_present_2', 'tough_vs_raising_1', 'sentential_subject_island', 'drop_argument', 'matrix_question_npi_licensor_present', 'wh_vs_that_with_gap_long_distance', 'principle_A_domain_2', 'complex_NP_island', 'left_branch_island_echo_question', 'wh_questions_subject_gap_long_distance', 'coordinate_structure_constraint_object_extraction', 'wh_questions_object_gap', 'coordinate_structure_constraint_complex_left_branch', 'existential_there_quantifiers_2', 'sentential_negation_npi_scope'}) | +| blimp/coordinate_structure_constraint_object_extraction/hard | blimp | 
coordinate_structure_constraint_object_extraction | hard | blimp__hard | MultipleChoice | MultipleChoice(inputs=constantly(), choices=['sentence_good', 'sentence_bad'], labels=constantly(0), choices_list=None, splits=('train', 'validation', 'test'), dataset_name=None, config_name={'wh_vs_that_with_gap', 'principle_A_c_command', 'principle_A_reconstruction', 'npi_present_1', 'superlative_quantifiers_1', 'animate_subject_passive', 'inchoative', 'npi_present_2', 'tough_vs_raising_1', 'sentential_subject_island', 'drop_argument', 'matrix_question_npi_licensor_present', 'wh_vs_that_with_gap_long_distance', 'principle_A_domain_2', 'complex_NP_island', 'left_branch_island_echo_question', 'wh_questions_subject_gap_long_distance', 'coordinate_structure_constraint_object_extraction', 'wh_questions_object_gap', 'coordinate_structure_constraint_complex_left_branch', 'existential_there_quantifiers_2', 'sentential_negation_npi_scope'}) | +| blimp/inchoative/hard | blimp | inchoative | hard | blimp__hard | MultipleChoice | MultipleChoice(inputs=constantly(), choices=['sentence_good', 'sentence_bad'], labels=constantly(0), choices_list=None, splits=('train', 'validation', 'test'), dataset_name=None, config_name={'wh_vs_that_with_gap', 'principle_A_c_command', 'principle_A_reconstruction', 'npi_present_1', 'superlative_quantifiers_1', 'animate_subject_passive', 'inchoative', 'npi_present_2', 'tough_vs_raising_1', 'sentential_subject_island', 'drop_argument', 'matrix_question_npi_licensor_present', 'wh_vs_that_with_gap_long_distance', 'principle_A_domain_2', 'complex_NP_island', 'left_branch_island_echo_question', 'wh_questions_subject_gap_long_distance', 'coordinate_structure_constraint_object_extraction', 'wh_questions_object_gap', 'coordinate_structure_constraint_complex_left_branch', 'existential_there_quantifiers_2', 'sentential_negation_npi_scope'}) | +| blimp/sentential_negation_npi_scope/hard | blimp | sentential_negation_npi_scope | hard | blimp__hard | MultipleChoice | MultipleChoice(inputs=constantly(), choices=['sentence_good', 'sentence_bad'], labels=constantly(0), choices_list=None, splits=('train', 'validation', 'test'), dataset_name=None, config_name={'wh_vs_that_with_gap', 'principle_A_c_command', 'principle_A_reconstruction', 'npi_present_1', 'superlative_quantifiers_1', 'animate_subject_passive', 'inchoative', 'npi_present_2', 'tough_vs_raising_1', 'sentential_subject_island', 'drop_argument', 'matrix_question_npi_licensor_present', 'wh_vs_that_with_gap_long_distance', 'principle_A_domain_2', 'complex_NP_island', 'left_branch_island_echo_question', 'wh_questions_subject_gap_long_distance', 'coordinate_structure_constraint_object_extraction', 'wh_questions_object_gap', 'coordinate_structure_constraint_complex_left_branch', 'existential_there_quantifiers_2', 'sentential_negation_npi_scope'}) | +| cos_e/v1.0 | cos_e | v1.0 | | cos_e | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels=<function <lambda> at 0x7f5af0755550>, choices_list='choices', splits=('train', 'validation', 'test'), dataset_name=None, config_name='v1.0') | +| cosmos_qa | cosmos_qa | | | cosmos_qa | MultipleChoice | MultipleChoice(inputs=cat(fields=['context', 'question'], separator=' '), choices=['answer0', 'answer1', 'answer2', 'answer3'], labels='label', choices_list=None, splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) | +| dream | dream | | | dream | MultipleChoice | MultipleChoice(inputs=<function <lambda> at 0x7f5af07555e0>, choices=(), labels=<function <lambda> at 0x7f5af0755670>, choices_list='choice', splits=('train',
'validation', 'test'), dataset_name=None, config_name=None) | +| openbookqa | openbookqa | | | openbookqa | MultipleChoice | MultipleChoice(inputs='question_stem', choices=(), labels='answerKey', choices_list=<function ...>, splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) | +| qasc | qasc | | | qasc | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels=<function <lambda> at 0x7f5af0755700>, choices_list=<function ...>, splits=['train', 'validation', None], dataset_name=None, config_name=None) | +| quartz | quartz | | | quartz | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answerKey', choices_list=<function ...>, splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) | +| quail | quail | | | quail | MultipleChoice | MultipleChoice(inputs=cat(fields=['context', 'question'], separator=' '), choices=(), labels='correct_answer_id', choices_list='answers', splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) | +| sciq | sciq | | | sciq | MultipleChoice | MultipleChoice(inputs='question', choices=['correct_answer', 'distractor1', 'distractor2', 'distractor3'], labels=constantly(0), choices_list=None, splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) | +| social_i_qa | social_i_qa | | | social_i_qa | MultipleChoice | MultipleChoice(inputs='question', choices=['answerA', 'answerB', 'answerC'], labels='label', choices_list=None, splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) | +| wiki_hop | wiki_hop | | | wiki_hop | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels=<function <lambda> at 0x7f5af0755820>, choices_list='candidates', splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) | +| wiqa | wiqa | | | wiqa | MultipleChoice | MultipleChoice(inputs='question_stem', choices=(), labels='answer_label_as_choice', choices_list=<function <lambda> at 0x7f5af07558b0>, splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) | +| piqa | piqa | | | piqa | MultipleChoice | MultipleChoice(inputs='goal', choices=['sol1', 'sol2'], labels='label', choices_list=None, splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) | +| hellaswag | hellaswag | | | hellaswag | MultipleChoice | MultipleChoice(inputs='ctx_a', choices=(), labels='label', choices_list=<function <lambda> at 0x7f5af0755940>, splits=['train', 'validation', None], dataset_name=None, config_name=None) | +| super_glue/copa | super_glue | copa | | super_glue___copa | MultipleChoice | MultipleChoice(inputs='premise', choices=['choice1', 'choice2'], labels='label', choices_list=None, splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) | +| art | art | | | art | MultipleChoice | MultipleChoice(inputs=cat(fields=['hypothesis_1', 'hypothesis_2'], separator=' '), choices=['observation_1', 'observation_2'], labels=<function <lambda> at 0x7f5af07559d0>, choices_list=None, splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) | +| hendrycks_test/security_studies | hendrycks_test | security_studies | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering',
'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/high_school_psychology | hendrycks_test | high_school_psychology | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/high_school_statistics | hendrycks_test | high_school_statistics | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 
'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/high_school_us_history | hendrycks_test | high_school_us_history | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/high_school_world_history | hendrycks_test | high_school_world_history | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/human_aging | hendrycks_test | human_aging | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 
'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/human_sexuality | hendrycks_test | human_sexuality | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/international_law | hendrycks_test | international_law | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 
'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/jurisprudence | hendrycks_test | jurisprudence | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/logical_fallacies | hendrycks_test | logical_fallacies | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 
'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/machine_learning | hendrycks_test | machine_learning | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/management | hendrycks_test | management | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/marketing | hendrycks_test | marketing | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 
'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/medical_genetics | hendrycks_test | medical_genetics | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/miscellaneous | hendrycks_test | miscellaneous | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 
'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/moral_disputes | hendrycks_test | moral_disputes | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/moral_scenarios | hendrycks_test | moral_scenarios | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/nutrition | hendrycks_test | nutrition | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], 
dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/philosophy | hendrycks_test | philosophy | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/prehistory | hendrycks_test | prehistory | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 
'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/professional_accounting | hendrycks_test | professional_accounting | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/professional_law | hendrycks_test | professional_law | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 
'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/professional_medicine | hendrycks_test | professional_medicine | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/professional_psychology | hendrycks_test | professional_psychology | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/high_school_physics | hendrycks_test | high_school_physics | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 
'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/world_religions | hendrycks_test | world_religions | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/virology | hendrycks_test | virology | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 
'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/us_foreign_policy | hendrycks_test | us_foreign_policy | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/sociology | hendrycks_test | sociology | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/public_relations | hendrycks_test | public_relations | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 
'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/high_school_microeconomics | hendrycks_test | high_school_microeconomics | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/astronomy | hendrycks_test | astronomy | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 
'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/high_school_macroeconomics | hendrycks_test | high_school_macroeconomics | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/high_school_mathematics | hendrycks_test | high_school_mathematics | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 
'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/abstract_algebra | hendrycks_test | abstract_algebra | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/anatomy | hendrycks_test | anatomy | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/business_ethics | hendrycks_test | business_ethics | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 
'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/clinical_knowledge | hendrycks_test | clinical_knowledge | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/college_biology | hendrycks_test | college_biology | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 
'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/college_computer_science | hendrycks_test | college_computer_science | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/college_mathematics | hendrycks_test | college_mathematics | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/college_medicine | hendrycks_test | college_medicine | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), 
labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/college_physics | hendrycks_test | college_physics | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/computer_security | hendrycks_test | computer_security | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 
'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/college_chemistry | hendrycks_test | college_chemistry | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/econometrics | hendrycks_test | econometrics | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 
'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/electrical_engineering | hendrycks_test | electrical_engineering | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/elementary_mathematics | hendrycks_test | elementary_mathematics | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/formal_logic | hendrycks_test | formal_logic | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 
'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/global_facts | hendrycks_test | global_facts | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/high_school_government_and_politics | hendrycks_test | high_school_government_and_politics | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 
'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/high_school_biology | hendrycks_test | high_school_biology | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/high_school_chemistry | hendrycks_test | high_school_chemistry | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/high_school_computer_science | hendrycks_test 
| high_school_computer_science | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/high_school_european_history | hendrycks_test | high_school_european_history | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) | +| hendrycks_test/high_school_geography | hendrycks_test | high_school_geography | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 
'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) |
+| hendrycks_test/conceptual_physics | hendrycks_test | conceptual_physics | | hendrycks_test | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels='answer', choices_list='choices', splits=['test', 'dev', 'validation'], dataset_name=None, config_name=['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']) |
+| winogrande/winogrande_xl | winogrande | winogrande_xl | | winogrande | MultipleChoice | MultipleChoice(inputs='sentence', choices=['option1', 'option2'], labels='answer', choices_list=None, splits=['train', 'validation', None], dataset_name=None, config_name='winogrande_xl') |
+| codah/codah | codah | codah | | codah | MultipleChoice | MultipleChoice(inputs='question_propmt', choices=(), labels='correct_answer_idx', choices_list='candidate_answers', splits=('train', 'validation', 'test'), dataset_name=None, config_name='codah') |
+| ai2_arc/ARC-Easy/challenge | ai2_arc | ARC-Easy | challenge | ai2_arc__challenge | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels=<function <lambda> at 0x7f5af0755af0>, choices_list=<function ... at 0x...>, splits=('train', 'validation', 'test'), dataset_name=None, config_name=['ARC-Challenge', 'ARC-Easy']) |
+| ai2_arc/ARC-Challenge/challenge | ai2_arc | ARC-Challenge | challenge | ai2_arc__challenge | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels=<function <lambda> at 0x7f5af0755af0>, choices_list=<function ... at 0x...>, splits=('train', 'validation', 'test'), dataset_name=None, config_name=['ARC-Challenge', 'ARC-Easy']) |
+| definite_pronoun_resolution | definite_pronoun_resolution | | | definite_pronoun_resolution | MultipleChoice | MultipleChoice(inputs=cat(fields=['sentence', 'pronoun'], separator=' : '), choices=(), labels='label', choices_list='candidates', splits=['train', None, 'test'], dataset_name=None, config_name=None) |
+| swag | swag | | | swag | MultipleChoice | MultipleChoice(inputs=cat(fields=['sent1', 'sent2'], separator=' '), choices=['ending0', 'ending1', 'ending2', 'ending3'], labels='label', choices_list=None, splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) |
+| math_qa | math_qa | | | math_qa | MultipleChoice | MultipleChoice(inputs='Problem', choices=(), labels=<function <lambda> at 0x7f5af0755ee0>, choices_list=<function <lambda> at 0x7f5af0755e50>, splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) |
+| trec | trec | | | trec | Classification | Classification(sentence1='text', sentence2='sentence2', labels='fine_label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) |
+| vitaminc/tals--vitaminc | tals/vitaminc | tals--vitaminc | | tals_vitaminc | Classification | Classification(sentence1='claim', sentence2='evidence', labels='label', splits=('train', 'validation', 'test'), dataset_name='tals/vitaminc', config_name='tals--vitaminc') |
+| hope_edi/english | hope_edi | english | | hope_edi | Classification | Classification(sentence1='text', sentence2='sentence2', labels='label', splits=['train', 'validation', None], dataset_name=None, config_name=['english']) |
+| ethos/binary | ethos | binary | | ethos___binary | Classification | Classification(sentence1='text', sentence2='sentence2', labels='label', splits=['train', None, None], dataset_name=None, config_name=None) |
+| ethos/multilabel | ethos | multilabel | | ethos___multilabel | Classification | Classification(sentence1='text', sentence2='sentence2', labels=<function <lambda> at 0x7f5af0755f70>, splits=['train', None, None], dataset_name=None, config_name=None) |
+| discovery/discovery | discovery | discovery | | discovery | Classification | Classification(sentence1='sentence1', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=['discovery']) |
+| pragmeval/verifiability/single_input | pragmeval | verifiability | single_input | pragmeval__single_input | Classification | Classification(sentence1='sentence', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=['emobank-arousal', 'emobank-dominance', 'emobank-valence', 'squinky-formality', 'squinky-implicature', 'squinky-informativeness', 'switchboard', 'mrda', 'verifiability']) |
+| pragmeval/squinky-informativeness/single_input | pragmeval | squinky-informativeness | single_input | pragmeval__single_input | Classification | Classification(sentence1='sentence', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=['emobank-arousal', 'emobank-dominance', 'emobank-valence', 'squinky-formality', 'squinky-implicature', 'squinky-informativeness', 'switchboard', 'mrda', 'verifiability']) |
+| pragmeval/squinky-implicature/single_input | pragmeval | squinky-implicature | single_input | pragmeval__single_input | Classification | Classification(sentence1='sentence', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=['emobank-arousal', 'emobank-dominance', 'emobank-valence', 'squinky-formality', 'squinky-implicature', 'squinky-informativeness', 'switchboard',
'mrda', 'verifiability']) | +| pragmeval/squinky-formality/single_input | pragmeval | squinky-formality | single_input | pragmeval__single_input | Classification | Classification(sentence1='sentence', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=['emobank-arousal', 'emobank-dominance', 'emobank-valence', 'squinky-formality', 'squinky-implicature', 'squinky-informativeness', 'switchboard', 'mrda', 'verifiability']) | +| pragmeval/emobank-valence/single_input | pragmeval | emobank-valence | single_input | pragmeval__single_input | Classification | Classification(sentence1='sentence', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=['emobank-arousal', 'emobank-dominance', 'emobank-valence', 'squinky-formality', 'squinky-implicature', 'squinky-informativeness', 'switchboard', 'mrda', 'verifiability']) | +| pragmeval/emobank-dominance/single_input | pragmeval | emobank-dominance | single_input | pragmeval__single_input | Classification | Classification(sentence1='sentence', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=['emobank-arousal', 'emobank-dominance', 'emobank-valence', 'squinky-formality', 'squinky-implicature', 'squinky-informativeness', 'switchboard', 'mrda', 'verifiability']) | +| pragmeval/emobank-arousal/single_input | pragmeval | emobank-arousal | single_input | pragmeval__single_input | Classification | Classification(sentence1='sentence', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=['emobank-arousal', 'emobank-dominance', 'emobank-valence', 'squinky-formality', 'squinky-implicature', 'squinky-informativeness', 'switchboard', 'mrda', 'verifiability']) | +| pragmeval/switchboard/single_input | pragmeval | switchboard | single_input | pragmeval__single_input | Classification | Classification(sentence1='sentence', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=['emobank-arousal', 'emobank-dominance', 'emobank-valence', 'squinky-formality', 'squinky-implicature', 'squinky-informativeness', 'switchboard', 'mrda', 'verifiability']) | +| pragmeval/mrda/single_input | pragmeval | mrda | single_input | pragmeval__single_input | Classification | Classification(sentence1='sentence', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=['emobank-arousal', 'emobank-dominance', 'emobank-valence', 'squinky-formality', 'squinky-implicature', 'squinky-informativeness', 'switchboard', 'mrda', 'verifiability']) | +| pragmeval/stac/pairs | pragmeval | stac | pairs | pragmeval__pairs | Classification | Classification(sentence1='sentence1', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=['emergent', 'gum', 'pdtb', 'persuasiveness-claimtype', 'persuasiveness-eloquence', 'persuasiveness-premisetype', 'persuasiveness-relevance', 'persuasiveness-specificity', 'persuasiveness-strength', 'sarcasm', 'stac']) | +| pragmeval/persuasiveness-specificity/pairs | pragmeval | persuasiveness-specificity | pairs | pragmeval__pairs | Classification | Classification(sentence1='sentence1', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=['emergent', 'gum', 'pdtb', 'persuasiveness-claimtype', 'persuasiveness-eloquence', 
'persuasiveness-premisetype', 'persuasiveness-relevance', 'persuasiveness-specificity', 'persuasiveness-strength', 'sarcasm', 'stac']) | +| pragmeval/persuasiveness-relevance/pairs | pragmeval | persuasiveness-relevance | pairs | pragmeval__pairs | Classification | Classification(sentence1='sentence1', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=['emergent', 'gum', 'pdtb', 'persuasiveness-claimtype', 'persuasiveness-eloquence', 'persuasiveness-premisetype', 'persuasiveness-relevance', 'persuasiveness-specificity', 'persuasiveness-strength', 'sarcasm', 'stac']) | +| pragmeval/emergent/pairs | pragmeval | emergent | pairs | pragmeval__pairs | Classification | Classification(sentence1='sentence1', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=['emergent', 'gum', 'pdtb', 'persuasiveness-claimtype', 'persuasiveness-eloquence', 'persuasiveness-premisetype', 'persuasiveness-relevance', 'persuasiveness-specificity', 'persuasiveness-strength', 'sarcasm', 'stac']) | +| pragmeval/gum/pairs | pragmeval | gum | pairs | pragmeval__pairs | Classification | Classification(sentence1='sentence1', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=['emergent', 'gum', 'pdtb', 'persuasiveness-claimtype', 'persuasiveness-eloquence', 'persuasiveness-premisetype', 'persuasiveness-relevance', 'persuasiveness-specificity', 'persuasiveness-strength', 'sarcasm', 'stac']) | +| pragmeval/pdtb/pairs | pragmeval | pdtb | pairs | pragmeval__pairs | Classification | Classification(sentence1='sentence1', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=['emergent', 'gum', 'pdtb', 'persuasiveness-claimtype', 'persuasiveness-eloquence', 'persuasiveness-premisetype', 'persuasiveness-relevance', 'persuasiveness-specificity', 'persuasiveness-strength', 'sarcasm', 'stac']) | +| pragmeval/persuasiveness-claimtype/pairs | pragmeval | persuasiveness-claimtype | pairs | pragmeval__pairs | Classification | Classification(sentence1='sentence1', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=['emergent', 'gum', 'pdtb', 'persuasiveness-claimtype', 'persuasiveness-eloquence', 'persuasiveness-premisetype', 'persuasiveness-relevance', 'persuasiveness-specificity', 'persuasiveness-strength', 'sarcasm', 'stac']) | +| pragmeval/persuasiveness-strength/pairs | pragmeval | persuasiveness-strength | pairs | pragmeval__pairs | Classification | Classification(sentence1='sentence1', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=['emergent', 'gum', 'pdtb', 'persuasiveness-claimtype', 'persuasiveness-eloquence', 'persuasiveness-premisetype', 'persuasiveness-relevance', 'persuasiveness-specificity', 'persuasiveness-strength', 'sarcasm', 'stac']) | +| pragmeval/sarcasm/pairs | pragmeval | sarcasm | pairs | pragmeval__pairs | Classification | Classification(sentence1='sentence1', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=['emergent', 'gum', 'pdtb', 'persuasiveness-claimtype', 'persuasiveness-eloquence', 'persuasiveness-premisetype', 'persuasiveness-relevance', 'persuasiveness-specificity', 'persuasiveness-strength', 'sarcasm', 'stac']) | +| pragmeval/persuasiveness-premisetype/pairs | pragmeval | 
persuasiveness-premisetype | pairs | pragmeval__pairs | Classification | Classification(sentence1='sentence1', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=['emergent', 'gum', 'pdtb', 'persuasiveness-claimtype', 'persuasiveness-eloquence', 'persuasiveness-premisetype', 'persuasiveness-relevance', 'persuasiveness-specificity', 'persuasiveness-strength', 'sarcasm', 'stac']) | +| pragmeval/persuasiveness-eloquence/pairs | pragmeval | persuasiveness-eloquence | pairs | pragmeval__pairs | Classification | Classification(sentence1='sentence1', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=['emergent', 'gum', 'pdtb', 'persuasiveness-claimtype', 'persuasiveness-eloquence', 'persuasiveness-premisetype', 'persuasiveness-relevance', 'persuasiveness-specificity', 'persuasiveness-strength', 'sarcasm', 'stac']) | +| glue/cola | glue | cola | | glue___cola | Classification | Classification(sentence1='sentence', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) | +| glue/sst2 | glue | sst2 | | glue___sst2 | Classification | Classification(sentence1='sentence', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) | +| glue/mrpc | glue | mrpc | | glue___mrpc | Classification | Classification(sentence1='sentence1', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) | +| glue/qqp | glue | qqp | | glue___qqp | Classification | Classification(sentence1='question1', sentence2='question2', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) | +| glue/stsb | glue | stsb | | glue___stsb | Classification | Classification(sentence1='sentence1', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) | +| glue/mnli | glue | mnli | | glue___mnli | Classification | Classification(sentence1='premise', sentence2='hypothesis', labels='label', splits=['train', 'validation_matched', None], dataset_name=None, config_name=None) | +| glue/qnli | glue | qnli | | glue___qnli | Classification | Classification(sentence1='question', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) | +| glue/rte | glue | rte | | glue___rte | Classification | Classification(sentence1='sentence1', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) | +| glue/wnli | glue | wnli | | glue___wnli | Classification | Classification(sentence1='sentence1', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) | +| super_glue/boolq | super_glue | boolq | | super_glue___boolq | Classification | Classification(sentence1='question', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) | +| super_glue/cb | super_glue | cb | | super_glue___cb | Classification | Classification(sentence1='premise', sentence2='hypothesis', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) | +| super_glue/multirc | super_glue | multirc | | super_glue___multirc | Classification | Classification(sentence1='question', sentence2='sentence2', labels='label', splits=('train', 
'validation', 'test'), dataset_name=None, config_name=None) | +| super_glue/wic | super_glue | wic | | super_glue___wic | Classification | Classification(sentence1=cat(fields=['word', 'sentence1'], separator=' : '), sentence2=cat(fields=['word', 'sentence2'], separator=' : '), labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) | +| super_glue/axg | super_glue | axg | | super_glue___axg | Classification | Classification(sentence1='premise', sentence2='hypothesis', labels='label', splits=['test', None, None], dataset_name=None, config_name=None) | +| tweet_eval/stance_abortion | tweet_eval | stance_abortion | | tweet_eval | Classification | Classification(sentence1='text', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=['emoji', 'emotion', 'hate', 'irony', 'offensive', 'sentiment', 'stance_abortion', 'stance_atheism', 'stance_climate', 'stance_feminist', 'stance_hillary']) | +| tweet_eval/hate | tweet_eval | hate | | tweet_eval | Classification | Classification(sentence1='text', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=['emoji', 'emotion', 'hate', 'irony', 'offensive', 'sentiment', 'stance_abortion', 'stance_atheism', 'stance_climate', 'stance_feminist', 'stance_hillary']) | +| tweet_eval/stance_feminist | tweet_eval | stance_feminist | | tweet_eval | Classification | Classification(sentence1='text', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=['emoji', 'emotion', 'hate', 'irony', 'offensive', 'sentiment', 'stance_abortion', 'stance_atheism', 'stance_climate', 'stance_feminist', 'stance_hillary']) | +| tweet_eval/stance_climate | tweet_eval | stance_climate | | tweet_eval | Classification | Classification(sentence1='text', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=['emoji', 'emotion', 'hate', 'irony', 'offensive', 'sentiment', 'stance_abortion', 'stance_atheism', 'stance_climate', 'stance_feminist', 'stance_hillary']) | +| tweet_eval/stance_atheism | tweet_eval | stance_atheism | | tweet_eval | Classification | Classification(sentence1='text', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=['emoji', 'emotion', 'hate', 'irony', 'offensive', 'sentiment', 'stance_abortion', 'stance_atheism', 'stance_climate', 'stance_feminist', 'stance_hillary']) | +| tweet_eval/stance_hillary | tweet_eval | stance_hillary | | tweet_eval | Classification | Classification(sentence1='text', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=['emoji', 'emotion', 'hate', 'irony', 'offensive', 'sentiment', 'stance_abortion', 'stance_atheism', 'stance_climate', 'stance_feminist', 'stance_hillary']) | +| tweet_eval/offensive | tweet_eval | offensive | | tweet_eval | Classification | Classification(sentence1='text', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=['emoji', 'emotion', 'hate', 'irony', 'offensive', 'sentiment', 'stance_abortion', 'stance_atheism', 'stance_climate', 'stance_feminist', 'stance_hillary']) | +| tweet_eval/sentiment | tweet_eval | sentiment | | tweet_eval | Classification | Classification(sentence1='text', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, 
config_name=['emoji', 'emotion', 'hate', 'irony', 'offensive', 'sentiment', 'stance_abortion', 'stance_atheism', 'stance_climate', 'stance_feminist', 'stance_hillary']) |
+| tweet_eval/emoji | tweet_eval | emoji | | tweet_eval | Classification | Classification(sentence1='text', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=['emoji', 'emotion', 'hate', 'irony', 'offensive', 'sentiment', 'stance_abortion', 'stance_atheism', 'stance_climate', 'stance_feminist', 'stance_hillary']) |
+| tweet_eval/irony | tweet_eval | irony | | tweet_eval | Classification | Classification(sentence1='text', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=['emoji', 'emotion', 'hate', 'irony', 'offensive', 'sentiment', 'stance_abortion', 'stance_atheism', 'stance_climate', 'stance_feminist', 'stance_hillary']) |
+| tweet_eval/emotion | tweet_eval | emotion | | tweet_eval | Classification | Classification(sentence1='text', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=['emoji', 'emotion', 'hate', 'irony', 'offensive', 'sentiment', 'stance_abortion', 'stance_atheism', 'stance_climate', 'stance_feminist', 'stance_hillary']) |
+| lex_glue/eurlex | lex_glue | eurlex | | lex_glue___eurlex | Classification | Classification(sentence1='text', sentence2='sentence2', labels='labels', splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) |
+| lex_glue/scotus | lex_glue | scotus | | lex_glue___scotus | Classification | Classification(sentence1='text', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) |
+| lex_glue/ledgar | lex_glue | ledgar | | lex_glue___ledgar | Classification | Classification(sentence1='text', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) |
+| lex_glue/unfair_tos | lex_glue | unfair_tos | | lex_glue___unfair_tos | Classification | Classification(sentence1='text', sentence2='sentence2', labels='labels', splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) |
+| lex_glue/case_hold | lex_glue | case_hold | | lex_glue___case_hold | MultipleChoice | MultipleChoice(inputs='context', choices=(), labels='label', choices_list='endings', splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) |
+| imdb | imdb | | | imdb | Classification | Classification(sentence1='text', sentence2='sentence2', labels='label', splits=['train', None, 'test'], dataset_name=None, config_name=None) |
+| rotten_tomatoes | rotten_tomatoes | | | rotten_tomatoes | Classification | Classification(sentence1='text', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) |
+| ag_news | ag_news | | | ag_news | Classification | Classification(sentence1='text', sentence2='sentence2', labels='label', splits=['train', None, 'test'], dataset_name=None, config_name=None) |
+| yelp_review_full/yelp_review_full | yelp_review_full | yelp_review_full | | yelp_review_full | Classification | Classification(sentence1='text', sentence2='sentence2', labels='label', splits=['train', None, 'test'], dataset_name=None, config_name=['yelp_review_full']) |
+| financial_phrasebank/sentences_allagree | financial_phrasebank | sentences_allagree | | financial_phrasebank | Classification | Classification(sentence1='sentence', sentence2='sentence2', labels='label', splits=['train', None, None], dataset_name=None, config_name=['sentences_allagree', 'sentences_75agree', 'sentences_66agree', 'sentences_50agree']) |
+| financial_phrasebank/sentences_75agree | financial_phrasebank | sentences_75agree | | financial_phrasebank | Classification | Classification(sentence1='sentence', sentence2='sentence2', labels='label', splits=['train', None, None], dataset_name=None, config_name=['sentences_allagree', 'sentences_75agree', 'sentences_66agree', 'sentences_50agree']) |
+| financial_phrasebank/sentences_66agree | financial_phrasebank | sentences_66agree | | financial_phrasebank | Classification | Classification(sentence1='sentence', sentence2='sentence2', labels='label', splits=['train', None, None], dataset_name=None, config_name=['sentences_allagree', 'sentences_75agree', 'sentences_66agree', 'sentences_50agree']) |
+| financial_phrasebank/sentences_50agree | financial_phrasebank | sentences_50agree | | financial_phrasebank | Classification | Classification(sentence1='sentence', sentence2='sentence2', labels='label', splits=['train', None, None], dataset_name=None, config_name=['sentences_allagree', 'sentences_75agree', 'sentences_66agree', 'sentences_50agree']) |
+| poem_sentiment | poem_sentiment | | | poem_sentiment | Classification | Classification(sentence1='verse_text', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) |
+| dbpedia_14/dbpedia_14 | dbpedia_14 | dbpedia_14 | | dbpedia_14 | Classification | Classification(sentence1='content', sentence2='sentence2', labels='label', splits=['train', None, 'test'], dataset_name=None, config_name=['dbpedia_14']) |
+| amazon_polarity/amazon_polarity | amazon_polarity | amazon_polarity | | amazon_polarity | Classification | Classification(sentence1='content', sentence2='sentence2', labels='label', splits=['train', None, 'test'], dataset_name=None, config_name=['amazon_polarity']) |
+| app_reviews | app_reviews | | | app_reviews | Classification | Classification(sentence1='review', sentence2='sentence2', labels='star', splits=['train', None, None], dataset_name=None, config_name=None) |
+| hate_speech18 | hate_speech18 | | | hate_speech18 | Classification | Classification(sentence1='text', sentence2='sentence2', labels='label', splits=['train', None, None], dataset_name=None, config_name=None) |
+| sms_spam | sms_spam | | | sms_spam | Classification | Classification(sentence1='sms', sentence2='sentence2', labels='label', splits=['train', None, None], dataset_name=None, config_name=None) |
+| humicroedit/subtask-1 | humicroedit | subtask-1 | | humicroedit___subtask_1 | Classification | Classification(sentence1='original', sentence2='edit', labels='meanGrade', splits=('train', 'validation', 'test'), dataset_name='humicroedit', config_name='subtask-1') |
+| humicroedit/subtask-2 | humicroedit | subtask-2 | | humicroedit___subtask_2 | Classification | Classification(sentence1=cat(fields=['original1', 'edit1'], separator=' : '), sentence2=cat(fields=['original2', 'edit2'], separator=' : '), labels='label', splits=('train', 'validation', 'test'), dataset_name='humicroedit', config_name='subtask-2') |
+| snips_built_in_intents | snips_built_in_intents | | | snips_built_in_intents | Classification | Classification(sentence1='text', sentence2='sentence2', labels='label', splits=['train', None, None], dataset_name=None, config_name=None) |
+| banking77 | banking77 | | | banking77 | Classification | Classification(sentence1='text', sentence2='sentence2', labels='label', splits=['train', None, 'test'], dataset_name=None, config_name=None) |
+| hate_speech_offensive | hate_speech_offensive | | | hate_speech_offensive | Classification | Classification(sentence1='tweet', sentence2='sentence2', labels='class', splits=['train', None, None], dataset_name=None, config_name=None) |
+| hyperpartisan_news_detection/byarticle | hyperpartisan_news_detection | byarticle | | hyperpartisan_news_detection___byarticle | Classification | Classification(sentence1='text', sentence2='sentence2', labels='hyperpartisan', splits=['train', None, None], dataset_name=None, config_name=None) |
+| hyperpartisan_news_detection/bypublisher | hyperpartisan_news_detection | bypublisher | | hyperpartisan_news_detection___bypublisher | Classification | Classification(sentence1='text', sentence2='sentence2', labels='hyperpartisan', splits=['train', 'validation', None], dataset_name=None, config_name=None) |
+| go_emotions/simplified | go_emotions | simplified | | go_emotions___simplified | Classification | Classification(sentence1='text', sentence2='sentence2', labels='labels', splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) |
+| scicite | scicite | | | scicite | Classification | Classification(sentence1='string', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) |
+| liar | liar | | | liar | Classification | Classification(sentence1='statement', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) |
+| lexical_relation_classification/CogALexV | relbert/lexical_relation_classification | CogALexV | | relbert_lexical_relation_classification | Classification | Classification(sentence1='head', sentence2='tail', labels='relation', splits=('train', 'validation', 'test'), dataset_name='relbert/lexical_relation_classification', config_name=['BLESS', 'CogALexV', 'EVALution', 'K&H+N', 'ROOT09']) |
+| lexical_relation_classification/EVALution | relbert/lexical_relation_classification | EVALution | | relbert_lexical_relation_classification | Classification | Classification(sentence1='head', sentence2='tail', labels='relation', splits=('train', 'validation', 'test'), dataset_name='relbert/lexical_relation_classification', config_name=['BLESS', 'CogALexV', 'EVALution', 'K&H+N', 'ROOT09']) |
+| lexical_relation_classification/K&H+N | relbert/lexical_relation_classification | K&H+N | | relbert_lexical_relation_classification | Classification | Classification(sentence1='head', sentence2='tail', labels='relation', splits=('train', 'validation', 'test'), dataset_name='relbert/lexical_relation_classification', config_name=['BLESS', 'CogALexV', 'EVALution', 'K&H+N', 'ROOT09']) |
+| lexical_relation_classification/ROOT09 | relbert/lexical_relation_classification | ROOT09 | | relbert_lexical_relation_classification | Classification | Classification(sentence1='head', sentence2='tail', labels='relation', splits=('train', 'validation', 'test'), dataset_name='relbert/lexical_relation_classification', config_name=['BLESS', 'CogALexV', 'EVALution', 'K&H+N', 'ROOT09']) |
+| lexical_relation_classification/BLESS | relbert/lexical_relation_classification | BLESS | | relbert_lexical_relation_classification | Classification | Classification(sentence1='head', sentence2='tail', labels='relation', splits=('train', 'validation', 'test'), dataset_name='relbert/lexical_relation_classification', config_name=['BLESS', 'CogALexV', 'EVALution', 'K&H+N', 'ROOT09']) |
+| linguisticprobing/tree_depth | metaeval/linguisticprobing | tree_depth | | metaeval_linguisticprobing | Classification | Classification(sentence1='sentence', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name='metaeval/linguisticprobing', config_name=['subj_number', 'word_content', 'obj_number', 'past_present', 'sentence_length', 'top_constituents', 'tree_depth', 'coordination_inversion', 'odd_man_out', 'bigram_shift']) |
+| linguisticprobing/coordination_inversion | metaeval/linguisticprobing | coordination_inversion | | metaeval_linguisticprobing | Classification | Classification(sentence1='sentence', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name='metaeval/linguisticprobing', config_name=['subj_number', 'word_content', 'obj_number', 'past_present', 'sentence_length', 'top_constituents', 'tree_depth', 'coordination_inversion', 'odd_man_out', 'bigram_shift']) |
+| linguisticprobing/odd_man_out | metaeval/linguisticprobing | odd_man_out | | metaeval_linguisticprobing | Classification | Classification(sentence1='sentence', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name='metaeval/linguisticprobing', config_name=['subj_number', 'word_content', 'obj_number', 'past_present', 'sentence_length', 'top_constituents', 'tree_depth', 'coordination_inversion', 'odd_man_out', 'bigram_shift']) |
+| linguisticprobing/word_content | metaeval/linguisticprobing | word_content | | metaeval_linguisticprobing | Classification | Classification(sentence1='sentence', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name='metaeval/linguisticprobing', config_name=['subj_number', 'word_content', 'obj_number', 'past_present', 'sentence_length', 'top_constituents', 'tree_depth', 'coordination_inversion', 'odd_man_out', 'bigram_shift']) |
+| linguisticprobing/bigram_shift | metaeval/linguisticprobing | bigram_shift | | metaeval_linguisticprobing | Classification | Classification(sentence1='sentence', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name='metaeval/linguisticprobing', config_name=['subj_number', 'word_content', 'obj_number', 'past_present', 'sentence_length', 'top_constituents', 'tree_depth', 'coordination_inversion', 'odd_man_out', 'bigram_shift']) |
+| linguisticprobing/top_constituents | metaeval/linguisticprobing | top_constituents | | metaeval_linguisticprobing | Classification | Classification(sentence1='sentence', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name='metaeval/linguisticprobing', config_name=['subj_number', 'word_content', 'obj_number', 'past_present', 'sentence_length', 'top_constituents', 'tree_depth', 'coordination_inversion', 'odd_man_out', 'bigram_shift']) |
+| linguisticprobing/sentence_length | metaeval/linguisticprobing | sentence_length | | metaeval_linguisticprobing | Classification | Classification(sentence1='sentence', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name='metaeval/linguisticprobing', config_name=['subj_number', 'word_content', 'obj_number', 'past_present', 'sentence_length', 'top_constituents', 'tree_depth', 'coordination_inversion', 'odd_man_out', 'bigram_shift']) |
+| linguisticprobing/subj_number | metaeval/linguisticprobing | subj_number | | metaeval_linguisticprobing | Classification | Classification(sentence1='sentence', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name='metaeval/linguisticprobing', config_name=['subj_number', 'word_content', 'obj_number', 'past_present', 'sentence_length', 'top_constituents', 'tree_depth', 'coordination_inversion', 'odd_man_out', 'bigram_shift']) |
+| linguisticprobing/past_present | metaeval/linguisticprobing | past_present | | metaeval_linguisticprobing | Classification | Classification(sentence1='sentence', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name='metaeval/linguisticprobing', config_name=['subj_number', 'word_content', 'obj_number', 'past_present', 'sentence_length', 'top_constituents', 'tree_depth', 'coordination_inversion', 'odd_man_out', 'bigram_shift']) |
+| linguisticprobing/obj_number | metaeval/linguisticprobing | obj_number | | metaeval_linguisticprobing | Classification | Classification(sentence1='sentence', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name='metaeval/linguisticprobing', config_name=['subj_number', 'word_content', 'obj_number', 'past_present', 'sentence_length', 'top_constituents', 'tree_depth', 'coordination_inversion', 'odd_man_out', 'bigram_shift']) |
+| crowdflower/tweet_global_warming | metaeval/crowdflower | tweet_global_warming | | metaeval_crowdflower | Classification | Classification(sentence1='text', sentence2='sentence2', labels='label', splits=['train', None, None], dataset_name='metaeval/crowdflower', config_name=['sentiment_nuclear_power', 'tweet_global_warming', 'airline-sentiment', 'corporate-messaging', 'economic-news', 'political-media-audience', 'political-media-bias', 'political-media-message', 'text_emotion']) |
+| crowdflower/airline-sentiment | metaeval/crowdflower | airline-sentiment | | metaeval_crowdflower | Classification | Classification(sentence1='text', sentence2='sentence2', labels='label', splits=['train', None, None], dataset_name='metaeval/crowdflower', config_name=['sentiment_nuclear_power', 'tweet_global_warming', 'airline-sentiment', 'corporate-messaging', 'economic-news', 'political-media-audience', 'political-media-bias', 'political-media-message', 'text_emotion']) |
+| crowdflower/text_emotion | metaeval/crowdflower | text_emotion | | metaeval_crowdflower | Classification | Classification(sentence1='text', sentence2='sentence2', labels='label', splits=['train', None, None], dataset_name='metaeval/crowdflower', config_name=['sentiment_nuclear_power', 'tweet_global_warming', 'airline-sentiment', 'corporate-messaging', 'economic-news', 'political-media-audience', 'political-media-bias', 'political-media-message', 'text_emotion']) |
+| crowdflower/political-media-message | metaeval/crowdflower | political-media-message | | metaeval_crowdflower | Classification | Classification(sentence1='text', sentence2='sentence2', labels='label', splits=['train', None, None], dataset_name='metaeval/crowdflower', config_name=['sentiment_nuclear_power', 'tweet_global_warming', 'airline-sentiment', 'corporate-messaging', 'economic-news', 'political-media-audience', 'political-media-bias', 'political-media-message', 'text_emotion']) |
+| crowdflower/corporate-messaging | metaeval/crowdflower | corporate-messaging | | metaeval_crowdflower | Classification | Classification(sentence1='text', sentence2='sentence2', labels='label', splits=['train', None, None], dataset_name='metaeval/crowdflower', config_name=['sentiment_nuclear_power', 'tweet_global_warming', 'airline-sentiment', 'corporate-messaging', 'economic-news', 'political-media-audience', 'political-media-bias', 'political-media-message', 'text_emotion']) |
+| crowdflower/political-media-audience | metaeval/crowdflower | political-media-audience | | metaeval_crowdflower | Classification | Classification(sentence1='text', sentence2='sentence2', labels='label', splits=['train', None, None], dataset_name='metaeval/crowdflower', config_name=['sentiment_nuclear_power', 'tweet_global_warming', 'airline-sentiment', 'corporate-messaging', 'economic-news', 'political-media-audience', 'political-media-bias', 'political-media-message', 'text_emotion']) |
+| crowdflower/economic-news | metaeval/crowdflower | economic-news | | metaeval_crowdflower | Classification | Classification(sentence1='text', sentence2='sentence2', labels='label', splits=['train', None, None], dataset_name='metaeval/crowdflower', config_name=['sentiment_nuclear_power', 'tweet_global_warming', 'airline-sentiment', 'corporate-messaging', 'economic-news', 'political-media-audience', 'political-media-bias', 'political-media-message', 'text_emotion']) |
+| crowdflower/sentiment_nuclear_power | metaeval/crowdflower | sentiment_nuclear_power | | metaeval_crowdflower | Classification | Classification(sentence1='text', sentence2='sentence2', labels='label', splits=['train', None, None], dataset_name='metaeval/crowdflower', config_name=['sentiment_nuclear_power', 'tweet_global_warming', 'airline-sentiment', 'corporate-messaging', 'economic-news', 'political-media-audience', 'political-media-bias', 'political-media-message', 'text_emotion']) |
+| crowdflower/political-media-bias | metaeval/crowdflower | political-media-bias | | metaeval_crowdflower | Classification | Classification(sentence1='text', sentence2='sentence2', labels='label', splits=['train', None, None], dataset_name='metaeval/crowdflower', config_name=['sentiment_nuclear_power', 'tweet_global_warming', 'airline-sentiment', 'corporate-messaging', 'economic-news', 'political-media-audience', 'political-media-bias', 'political-media-message', 'text_emotion']) |
+| ethics/commonsense | metaeval/ethics | commonsense | | metaeval_ethics___commonsense | Classification | Classification(sentence1='text', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name='metaeval/ethics', config_name='commonsense') |
+| ethics/deontology | metaeval/ethics | deontology | | metaeval_ethics___deontology | Classification | Classification(sentence1='text', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name='metaeval/ethics', config_name='deontology') |
+| ethics/justice | metaeval/ethics | justice | | metaeval_ethics___justice | Classification | Classification(sentence1='text', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name='metaeval/ethics', config_name='justice') |
+| ethics/utilitarianism | metaeval/ethics | utilitarianism | | metaeval_ethics___utilitarianism | Classification | Classification(sentence1='text', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name='metaeval/ethics', config_name='utilitarianism') |
+| ethics/virtue | metaeval/ethics | virtue | | metaeval_ethics___virtue | Classification | Classification(sentence1='sentence1', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name='metaeval/ethics', config_name='virtue') |
+| emo/emo2019 | emo | emo2019 | | emo | Classification | Classification(sentence1='text', sentence2='sentence2', labels='label', splits=['train', None, 'test'], dataset_name=None, config_name=['emo2019']) |
+| google_wellformed_query | google_wellformed_query | | | google_wellformed_query | Classification | Classification(sentence1='content', sentence2='sentence2', labels='rating', splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) |
+| tweets_hate_speech_detection | tweets_hate_speech_detection | | | tweets_hate_speech_detection | Classification | Classification(sentence1='tweet', sentence2='sentence2', labels='label', splits=['train', None, None], dataset_name=None, config_name=None) |
+| adv_glue/adv_sst2 | adv_glue | adv_sst2 | | adv_glue___adv_sst2 | Classification | Classification(sentence1='sentence', sentence2='sentence2', labels='label', splits=['validation', None, None], dataset_name=None, config_name=None) |
+| adv_glue/adv_qqp | adv_glue | adv_qqp | | adv_glue___adv_qqp | Classification | Classification(sentence1='question1', sentence2='question2', labels='label', splits=['validation', None, None], dataset_name=None, config_name=None) |
+| adv_glue/adv_mnli | adv_glue | adv_mnli | | adv_glue___adv_mnli | Classification | Classification(sentence1='premise', sentence2='hypothesis', labels='label', splits=['validation', None, None], dataset_name=None, config_name=None) |
+| adv_glue/adv_mnli_mismatched | adv_glue | adv_mnli_mismatched | | adv_glue___adv_mnli_mismatched | Classification | Classification(sentence1='premise', sentence2='hypothesis', labels='label', splits=['validation', None, None], dataset_name=None, config_name=None) |
+| adv_glue/adv_qnli | adv_glue | adv_qnli | | adv_glue___adv_qnli | Classification | Classification(sentence1='question', sentence2='sentence2', labels='label', splits=['validation', None, None], dataset_name=None, config_name=None) |
+| adv_glue/adv_rte | adv_glue | adv_rte | | adv_glue___adv_rte | Classification | Classification(sentence1='sentence1', sentence2='sentence2', labels='label', splits=['validation', None, None], dataset_name=None, config_name=None) |
+| has_part | has_part | | | has_part | Classification | Classification(sentence1='arg1', sentence2='arg2', labels='score', splits=['train', None, None], dataset_name=None, config_name=None) |
+| wnut_17/wnut_17 | wnut_17 | wnut_17 | | wnut_17 | TokenClassification | TokenClassification(tokens='tokens', labels='ner_tags', splits=('train', 'validation', 'test'), dataset_name=None, config_name=['wnut_17']) |
+| ncbi_disease/ncbi_disease | ncbi_disease | ncbi_disease | | ncbi_disease | TokenClassification | TokenClassification(tokens='tokens', labels='ner_tags', splits=('train', 'validation', 'test'), dataset_name=None, config_name=['ncbi_disease']) |
+| acronym_identification | acronym_identification | | | acronym_identification | TokenClassification | TokenClassification(tokens='tokens', labels='labels', splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) |
+| jnlpba/jnlpba | jnlpba | jnlpba | | jnlpba | TokenClassification | TokenClassification(tokens='tokens', labels='ner_tags', splits=['train', 'validation', None], dataset_name=None, config_name=['jnlpba']) |
+| species_800/species_800 | species_800 | species_800 | | species_800 | TokenClassification | TokenClassification(tokens='tokens', labels='ner_tags', splits=('train', 'validation', 'test'), dataset_name=None, config_name=['species_800']) |
+| ontonotes_english/SpeedOfMagic--ontonotes_english | SpeedOfMagic/ontonotes_english | SpeedOfMagic--ontonotes_english | | SpeedOfMagic_ontonotes_english | TokenClassification | TokenClassification(tokens='tokens', labels='ner_tags', splits=('train', 'validation', 'test'), dataset_name='SpeedOfMagic/ontonotes_english', config_name='SpeedOfMagic--ontonotes_english') |
+| blog_authorship_corpus/gender | blog_authorship_corpus | | gender | blog_authorship_corpus__gender | Classification | Classification(sentence1='text', sentence2='sentence2', labels='gender', splits=['train', 'validation', None], dataset_name=None, config_name=None) |
+| blog_authorship_corpus/age | blog_authorship_corpus | | age | blog_authorship_corpus__age | Classification | Classification(sentence1='text', sentence2='sentence2', labels='age', splits=['train', 'validation', None], dataset_name=None, config_name=None) |
+| blog_authorship_corpus/horoscope | blog_authorship_corpus | | horoscope | blog_authorship_corpus__horoscope | Classification | Classification(sentence1='text', sentence2='sentence2', labels='horoscope', splits=['train', 'validation', None], dataset_name=None, config_name=None) |
+| blog_authorship_corpus/job | blog_authorship_corpus | | job | blog_authorship_corpus__job | Classification | Classification(sentence1='text', sentence2='sentence2', labels='job', splits=['train', 'validation', None], dataset_name=None, config_name=None) |
+| open_question_type | launch/open_question_type | | | launch_open_question_type | Classification | Classification(sentence1='question', sentence2='sentence2', labels='resolve_type', splits=('train', 'validation', 'test'), dataset_name='launch/open_question_type', config_name=None) |
+| health_fact | health_fact | | | health_fact | Classification | Classification(sentence1='claim', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) |
+| commonsense_qa | commonsense_qa | | | commonsense_qa | MultipleChoice | MultipleChoice(inputs='question', choices=(), labels=<function <lambda> at 0x7f5af046a040>, choices_list=<function <lambda> at 0x…>, splits=['train', 'validation', None], dataset_name=None, config_name=None) |
+| mc_taco | mc_taco | | | mc_taco | Classification | Classification(sentence1=<function <lambda> at 0x7f5af046a0d0>, sentence2='sentence2', labels='label', splits=['validation', None, 'test'], dataset_name=None, config_name=None) |
+| ade_corpus_v2/Ade_corpus_v2_classification | ade_corpus_v2 | Ade_corpus_v2_classification | | ade_corpus_v2___Ade_corpus_v2_classification | Classification | Classification(sentence1='text', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) |
+| discosense | prajjwal1/discosense | | | discosense | MultipleChoice | MultipleChoice(inputs='context', choices=['option_0', 'option_1', 'option_2', 'option_3'], labels='label', choices_list=None, splits=('train', 'validation', 'test'), dataset_name='prajjwal1/discosense', config_name=None) |
+| circa | circa | | | circa | Classification | Classification(sentence1=cat(fields=['context', 'question-X'], separator=' '), sentence2='answer-Y', labels='goldstandard2', splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) |
+| code_x_glue_cc_defect_detection | code_x_glue_cc_defect_detection | | | code_x_glue_cc_defect_detection | Classification | Classification(sentence1='func', sentence2='sentence2', labels='target', splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) |
+| code_x_glue_cc_clone_detection_big_clone_bench | code_x_glue_cc_clone_detection_big_clone_bench | | | code_x_glue_cc_clone_detection_big_clone_bench | Classification | Classification(sentence1='func1', sentence2='func2', labels='label', splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) |
+| code_x_glue_cc_code_refinement/medium | code_x_glue_cc_code_refinement | medium | | code_x_glue_cc_code_refinement | MultipleChoice | MultipleChoice(inputs=constantly(), choices=['buggy', 'fixed'], labels=constantly(0), choices_list=None, splits=('train', 'validation', 'test'), dataset_name=None, config_name='medium') |
+| EffectiveFeedbackStudentWriting | YaHi/EffectiveFeedbackStudentWriting | | | effective_feedback_student_writing | Classification | Classification(sentence1='discourse_text', sentence2='sentence2', labels='discourse_effectiveness', splits=('train', 'validation', 'test'), dataset_name='YaHi/EffectiveFeedbackStudentWriting', config_name=None) |
+| promptSentiment | Ericwang/promptSentiment | | | promptSentiment | Classification | Classification(sentence1='text', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name='Ericwang/promptSentiment', config_name=None) |
+| promptNLI | Ericwang/promptNLI | | | promptNLI | Classification | Classification(sentence1='premise', sentence2='hypothesis', labels='label', splits=('train', 'validation', 'test'), dataset_name='Ericwang/promptNLI', config_name=None) |
+| promptSpoke | Ericwang/promptSpoke | | | promptSpoke | Classification | Classification(sentence1='text', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name='Ericwang/promptSpoke', config_name=None) |
+| promptProficiency | Ericwang/promptProficiency | | | promptProficiency | Classification | Classification(sentence1='text', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name='Ericwang/promptProficiency', config_name=None) |
+| promptGrammar | Ericwang/promptGrammar | | | promptGrammar | Classification | Classification(sentence1='text', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name='Ericwang/promptGrammar', config_name=None) |
+| promptCoherence | Ericwang/promptCoherence | | | promptCoherence | Classification | Classification(sentence1='text', sentence2='sentence2', labels='label', splits=('train', 'validation', 'test'), dataset_name='Ericwang/promptCoherence', config_name=None) |
+| phrase_similarity | PiC/phrase_similarity | | | phrase_similarity | Classification | Classification(sentence1=cat(fields=['phrase1', 'sentence1'], separator=' : '), sentence2=cat(fields=['phrase2', 'sentence2'], separator=' : '), labels='label', splits=('train', 'validation', 'test'), dataset_name='PiC/phrase_similarity', config_name=None) |
+| scientific-exaggeration-detection | copenlu/scientific-exaggeration-detection | | | exaggeration_detection | Classification | Classification(sentence1='press_release_conclusion', sentence2='abstract_conclusion', labels='exaggeration_label', splits=('train', 'validation', 'test'), dataset_name='copenlu/scientific-exaggeration-detection', config_name=None) |
+| quarel | quarel | | | quarel | Classification | Classification(sentence1='question', sentence2='sentence2', labels='answer_index', splits=('train', 'validation', 'test'), dataset_name=None, config_name=None) |
+| fever-evidence-related/mwong--fever-related | mwong/fever-evidence-related | mwong--fever-related | | mwong_fever_evidence_related | Classification | Classification(sentence1='claim', sentence2='evidence', labels='labels', splits=['train', 'valid', 'test'], dataset_name='mwong/fever-evidence-related', config_name='mwong--fever-related') |
+| numer_sense | numer_sense | | | numer_sense | Classification | Classification(sentence1='sentence', sentence2='sentence2', labels='target', splits=['train', None, None], dataset_name=None, config_name=None) |
+| dynasent/dynabench.dynasent.r1.all/r1 | dynabench/dynasent | dynabench.dynasent.r1.all | r1 | dynasent__r1 | Classification | Classification(sentence1='sentence', sentence2='sentence2', labels='gold_label', splits=('train', 'validation', 'test'), dataset_name='dynabench/dynasent', config_name='dynabench.dynasent.r1.all') |
+| dynasent/dynabench.dynasent.r2.all/r2 | dynabench/dynasent | dynabench.dynasent.r2.all | r2 | dynasent__r2 | Classification | Classification(sentence1='sentence', sentence2='sentence2', labels='gold_label', splits=('train', 'validation', 'test'), dataset_name='dynabench/dynasent', config_name='dynabench.dynasent.r2.all') |
+| Sarcasm_News_Headline | raquiba/Sarcasm_News_Headline | | | sarcasm_news | Classification | Classification(sentence1='headline', sentence2='sentence2', labels='is_sarcastic', splits=('train', 'validation', 'test'), dataset_name='raquiba/Sarcasm_News_Headline', config_name=None) |
\ No newline at end of file
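+
+Each row above is the repr of a preconfigured task template. As a minimal, hypothetical sketch (the keyword names are read off the `Classification` repr fields in this table and the `Model(tasks, args)` signature in models.py, not a confirmed public API), one entry could be instantiated and shared inside a multi-task model like this:
+
+```python
+import tasknet as tn  # assumes the package exports Classification and Model
+
+# Build the tweet_eval/irony task from its table row; the keyword
+# names mirror the repr fields shown above.
+task = tn.Classification(
+    dataset_name="tweet_eval",
+    config_name="irony",
+    sentence1="text",
+    labels="label",
+)
+
+# Wrap the task list in a shared-encoder model; "roberta-base" is an
+# illustrative choice of backbone, not prescribed by this table.
+model = tn.Model([task], dict(model_name="roberta-base"))
+```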