add seqio mixtures for bias/fairness eval (#530)
* add seqio mixtures for bias/fairness eval

* style
VictorSanh authored Oct 12, 2021
1 parent 80145e5 commit 420e8ed
Showing 2 changed files with 46 additions and 17 deletions.
35 changes: 19 additions & 16 deletions promptsource/seqio_tasks/experiment_D4.csv
@@ -1,9 +1,12 @@
HF_name,subset,task_by_convention,format,comment,seed_paper,september_check,do_train,do_eval,train_size,adjusted_train_size,D3_do_train,D3_do_eval,D3_adjusted_train_size,metric,multiple correct answer,Paper link,non_linguistic_knowledge,skip,Imported Task Name,imported category,input_length,_human_skill,Domain,Reference
crows_pairs,,bias_and_fairness,,test set only; authors themselves acknowledge some problems,Eval WG,,,TRUE,,,,,,,,,,,,,,,,
jigsaw_toxicity_pred,,bias_and_fairness,,current https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data ; want https://www.kaggle.com/c/jigsaw-unintended-bias-in-toxicity-classification,Eval WG,,,TRUE,,,,,,,,,,,,,,,,
-super_glue,axg,bias_and_fairness,cls,test set only,,,,TRUE,,,,,,,,,,,,,,,,
+super_glue,axg,bias_and_fairness,cls,test set only,Eval WG,,,TRUE,,,,,,,,,,,,,,,,
winogender,,bias_and_fairness,cls,also as axg in super_glue,Eval WG,,,TRUE,,,,,,,,,,,,,,,,
-winobias,,bias_and_fairness,cls,,Eval WG,,,TRUE,,,,,,,,,,,,,,,,
+wino_bias,type1_anti,bias_and_fairness,cls,,Eval WG,,,TRUE,,,,,,,,,,,,,,,,
+wino_bias,type2_anti,bias_and_fairness,cls,,Eval WG,,,TRUE,,,,,,,,,,,,,,,,
+wino_bias,type1_pro,bias_and_fairness,cls,,Eval WG,,,TRUE,,,,,,,,,,,,,,,,
+wino_bias,type2_pro,bias_and_fairness,cls,,Eval WG,,,TRUE,,,,,,,,,,,,,,,,
super_glue,wsc.fixed,coreference,cls,,,,,TRUE,554,0,TRUE,TRUE,554,accuracy,,https://arxiv.org/pdf/1905.00537.pdf,,,superglue-wsc,cls/other,single sentence,knowledge-? reading comprehension,,Levesque et al. 2012
winograd_wsc,wsc273,coreference,ext,,GPT,,,TRUE,0,0,,,0,accuracy,,https://www.aaai.org/ocs/index.php/KR/KR12/paper/download/4492/4924,,,,,,,,Levesque et al. 2012
winogrande,winogrande_xl,coreference,ext,,GPT,TRUE,,TRUE,40398,0,,,0,accuracy,,https://arxiv.org/pdf/1907.10641.pdf,,,WinoGrande,qa/multiple-choice qa,,knowledge-? reading comprehension,,Sakaguchi et al. 2020
@@ -17,11 +20,11 @@ super_glue,axb,NLI,cls,test set only,,TRUE,,TRUE,0,0,,,,,,,,,,,,,,
glue,mrpc,paraphrase,cls,,,,TRUE,TRUE,3668,3668,TRUE,TRUE,3668,accuracy;f1_score,,https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/I05-50025B15D.pdf,,,glue-mrpc,cls/paraphrase,,paraphrase,,Dolan and Brockett 2005
glue,qqp,paraphrase,cls,,,,TRUE,TRUE,363846,363846,TRUE,,363846,accuracy;f1_score,,https://aclanthology.org/I05-5002.pdf,,,glue-qqp,cls/paraphrase,,,,(link)
paws,labeled_final,paraphrase,cls,,,,TRUE,,49401,49401,TRUE,,49401,,,,,,paws,cls/paraphrase,,,,Zhang et al. 2019
-ai2_arc,ARC-Challenge,QA_closed_book,cls,,GPT,,,TRUE,1119,0,TRUE,,1119,"accuracy_with_tie : For each question, a system receives 1 point if it
-chooses the correct answer and 1/k if it reports a k-way tie
+ai2_arc,ARC-Challenge,QA_closed_book,cls,,GPT,,,TRUE,1119,0,TRUE,,1119,"accuracy_with_tie : For each question, a system receives 1 point if it
+chooses the correct answer and 1/k if it reports a k-way tie
(i.e., chooses multiple answers) that includes the correct answer.",,https://arxiv.org/pdf/1803.05457.pdf,mid-intensive,,ARC (chal.),qa/multiple-choice qa,,nontrivial_comprehension,,Clark et al. 2018
-ai2_arc,ARC-Easy,QA_closed_book,cls,,GPT,,,TRUE,2251,0,TRUE,,2251,"accuracy_with_tie: For each question, a system receives 1 point if it
-chooses the correct answer and 1/k if it reports a k-way tie
+ai2_arc,ARC-Easy,QA_closed_book,cls,,GPT,,,TRUE,2251,0,TRUE,,2251,"accuracy_with_tie: For each question, a system receives 1 point if it
+chooses the correct answer and 1/k if it reports a k-way tie
(i.e., chooses multiple answers) that includes the correct answer.",,https://arxiv.org/pdf/1803.05457.pdf,mid-intensive,,ARC (easy),Multiple choice,,,,
nq_open,,QA_closed_book,gen,,GPT,TRUE,,TRUE,87925,0,,TRUE,0,kilt-exact_match;average_accuracy_accross_answers,TRUE,https://direct.mit.edu/tacl/article/doi/10.1162/tacl_a_00276/43518/Natural-Questions-A-Benchmark-for-Question,intensive,,Natural Questions (open domain),,,trivia,,
kilt_tasks,hotpotqa,QA_closed_book,gen,recast as closed-book due to input length,self,,TRUE,,88869,88869,,,,,,,,,kilt hotpotqa,qa/closed-book qa,,encyclopedia; multi-hop QA,,Yang et al. 2018
@@ -31,13 +34,13 @@ wiki_qa,,QA_closed_book,cls,,CrossFit,,TRUE,,20360,20360,,,,,,https://aclantholo
adversarial_qa,dbidaf,QA_extractive,ext,,,TRUE,TRUE,,10000,10000,TRUE,,10000,,,https://aclanthology.org/2020.tacl-1.43/,,,adversarialqa,qa/machine reading comprehension,,,,Bartolo et al. 2020
adversarial_qa,dbert,QA_extractive,ext,,,TRUE,TRUE,,10000,10000,TRUE,,10000,,,,,,,,,,,
adversarial_qa,droberta,QA_extractive,ext,,,TRUE,TRUE,,10000,10000,TRUE,,10000,,,,,,,,,,,
-coqa,,QA_extractive,ext,GPT-easy,GPT,,,TRUE,7199,,,,,"macro_average_f1: for computing a model’s performance, each individual prediction is compared
-against n human answers resulting in n F1 scores,
-the maximum of which is chosen as the prediction’s
-F1.For each question, we average out F1 across
-these n sets, both for humans and models. In our
-final evaluation, we use n = 4 human answers for
-every question (the original answer and 3 additionally collected answers). The articles a, an and the
+coqa,,QA_extractive,ext,GPT-easy,GPT,,,TRUE,7199,,,,,"macro_average_f1: for computing a model’s performance, each individual prediction is compared
+against n human answers resulting in n F1 scores,
+the maximum of which is chosen as the prediction’s
+F1.For each question, we average out F1 across
+these n sets, both for humans and models. In our
+final evaluation, we use n = 4 human answers for
+every question (the original answer and 3 additionally collected answers). The articles a, an and the
and punctuations are excluded in evaluation.",from the paper it seems it could contain multiple answers but the datasets has only one answer per question,https://arxiv.org/pdf/1808.07042.pdf,,,,,,,,
duorc,SelfRC,QA_extractive,ext,,TaskEmbed;CrossFit,,TRUE,,60721,60721,,,,,,https://duorc.github.io/,,,DuoRC,qa/machine reading comprehension,,,Wikipedia/IMDB crowd,Saha et al. 2018
duorc,ParaphraseRC,QA_extractive,ext,,TaskEmbed;CrossFit,,TRUE,,69524,69524,,,,,,https://arxiv.org/pdf/1804.07927.pdf,,,DuoRC,paraphrased QA,,,,Saha et al. 2018
@@ -55,8 +58,8 @@ drop,,QA_generative,gen,"nontrivial math; try history_690, it's pretty hard even
cos_e,v1.11,QA_multiple_choice,cls,"same as commonsense_qa but with (poorly sourced) human explanations; questionable ""commonsense"" lots of world knowledge",Vania,TRUE,TRUE,,9741,9741,TRUE,,9741,,,,,,cos e,other/generate explanation,,,,Rajani et al. 2019
cosmos_qa,,QA_multiple_choice,cls,,,TRUE,TRUE,,25262,25262,TRUE,,25262,,,,,,cosmos qa,qa/multiple-choice qa,,,,Huang et al. 2019
dream,,QA_multiple_choice,cls,,,TRUE,TRUE,,6116,6116,TRUE,,6116,,,,,,dream,qa/multiple-choice qa,,,,Sun et al. 2019
-openbookqa,main,QA_multiple_choice,cls,interesting combo of pragmatics + scientific reasoning,GPT,,,TRUE,4957,0,TRUE,TRUE,4957,"accuracy_with_tie : For each question, a system receives 1 point if it
-chooses the correct answer and 1/k if it reports a k-way tie
+openbookqa,main,QA_multiple_choice,cls,interesting combo of pragmatics + scientific reasoning,GPT,,,TRUE,4957,0,TRUE,TRUE,4957,"accuracy_with_tie : For each question, a system receives 1 point if it
+chooses the correct answer and 1/k if it reports a k-way tie
(i.e., chooses multiple answers) that includes the correct answer.",,https://aclanthology.org/D18-1260.pdf,modest,,openbookqa,qa/multiple-choice qa,,pragmatics,,Mihaylov et al. 2018
qasc,,QA_multiple_choice,cls,,,TRUE,TRUE,,8134,8134,TRUE,,8134,,,,given?,,qasc,qa/multiple-choice qa,,,,Khot et al. 2020
quail,,QA_multiple_choice,cls,,,TRUE,TRUE,,10246,10246,TRUE,,10246,,,,,,quail,qa/multiple-choice qa,,,,Rogers et al. 2020
@@ -235,4 +238,4 @@ glue,stsb,fine-grain regression,,,,,,,,,,,,,,,,revisit whether to exclude fine-g
,,,,"""naturally perturbed"" version of BoolQ",,,,,,,,,,,,https://arxiv.org/pdf/2004.04849.pdf,,double check: missing from HF datasets,NP-BoolQ,Binary yes/no,,,,
,,,,,,,,,,,,,,,,https://aclanthology.org/D19-1608.pdf,,double check: missing from HF datasets,quartz-no knowledge,qa/multiple-choice qa,,,,Tafjord et al. 2019b
,,,,contrast sets,,,,,,,,,,,,https://arxiv.org/pdf/2004.02709.pdf,,double check: missing from HF datasets,Quoref-CS,Extractive QA,,,,
-,,,,contrast sets,,,,,,,,,,,,https://arxiv.org/pdf/2004.02709.pdf,,double check: missing from HF datasets,ROPES-CS,Extractive QA,,,,
+,,,,contrast sets,,,,,,,,,,,,https://arxiv.org/pdf/2004.02709.pdf,,double check: missing from HF datasets,ROPES-CS,Extractive QA,,,,
28 changes: 27 additions & 1 deletion promptsource/seqio_tasks/tasks.py
@@ -144,6 +144,7 @@ def add_task(dataset_name, subset_name, template_name, task_name=None, split_map
d4_eval: List[datatset_subset_tuple] = []
d3_train_gpt: List[datatset_subset_tuple] = []
d3_train_sglue: List[datatset_subset_tuple] = []
+bias_fairness_eval: List[datatset_subset_tuple] = []
gsheet: Dict[datatset_subset_tuple, Dict] = {}
experiment_path = pkg_resources.resource_filename(__name__, "experiment_D4.csv")
with open(experiment_path) as exp_file:
@@ -162,8 +163,14 @@ def add_task(dataset_name, subset_name, template_name, task_name=None, split_map
            d3_train_gpt.append(dataset_subset)
        if row["D3_do_train"] == "TRUE" and row["HF_name"] == "super_glue":
            d3_train_sglue.append(dataset_subset)
+        if (
+            row["do_eval"] == "TRUE"
+            and row["task_by_convention"] == "bias_and_fairness"
+            and row["HF_name"] != "winogender"
+        ):
+            bias_fairness_eval.append(dataset_subset)
        gsheet[dataset_subset] = row
-all_datasets = d4_train + d4_eval + d3_train_gpt + d3_train_sglue
+all_datasets = d4_train + d4_eval + d3_train_gpt + d3_train_sglue + bias_fairness_eval

all_templates = promptsource.templates.TemplateCollection()
all_templates.remove("anli") # Need to special-case ANLI due to weird split conventions
@@ -173,6 +180,7 @@ def add_task(dataset_name, subset_name, template_name, task_name=None, split_map
gpt_train_mixture: List[str] = []
sglue_train_mixture: List[str] = []
d4_eval_mixture: List[str] = []
+bias_fairness_eval_mixture: List[str] = []
mixture_cap: Dict[str, int] = {}
single_original_task: Dict[Tuple[str, str], str] = {}
all_original_tasks: List[str] = []
@@ -218,6 +226,8 @@ def add_task(dataset_name, subset_name, template_name, task_name=None, split_map
            if template.metadata.original_task:
                d4_eval_mixture.append(task_name)
            # TODO use template.metadata.answer_choices or answer_choice_keys here for rank eval
+        if (dataset_name, subset_name) in bias_fairness_eval:
+            bias_fairness_eval_mixture.append(task_name)

# Special case for ANLI, which has weirdly-named splits and rounds that should be subsets
dataset_name, subset_name = ("anli", None)
@@ -393,3 +403,19 @@ def add_task(dataset_name, subset_name, template_name, task_name=None, split_map
    [task for task in all_original_tasks if task in d4_train_mixture and task not in TASK_BLACKLIST],
    default_rate=lambda t: mixture_cap[t.name],
)
+
+seqio.MixtureRegistry.add(
+    "bias_fairness_eval",
+    bias_fairness_eval_mixture,
+    default_rate=functools.partial(seqio.mixing_rate_num_examples, maximum=500_000),
+)
+
+seqio.MixtureRegistry.add(
+    "bias_fairness_eval_score_eval",
+    [
+        task
+        for task in seqio.TaskRegistry.names()
+        if task.endswith("_score_eval") and task.split("_score_eval")[0] in bias_fairness_eval_mixture
+    ],
+    default_rate=functools.partial(seqio.mixing_rate_num_examples, maximum=500_000),
+)
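
For reference, a mixture registered through seqio.MixtureRegistry.add can afterwards be loaded with seqio's standard lookup API. The sketch below is not part of the commit: it assumes that importing promptsource.seqio_tasks.tasks executes the registrations above, and the sequence lengths and split name are placeholder choices.

import seqio
import promptsource.seqio_tasks.tasks  # noqa: F401 -- assumption: importing this module runs the registrations above

# Look up the mixture added in this commit and build a tf.data pipeline from it.
mixture = seqio.get_mixture_or_task("bias_fairness_eval")
ds = mixture.get_dataset(
    sequence_length={"inputs": 512, "targets": 64},  # placeholder lengths, not fixed by this commit
    split="test",  # several bias/fairness sets in experiment_D4.csv are test-only
    shuffle=False,
)
for ex in ds.take(1):
    print(ex)

# The rank-classification companion registered above is looked up the same way:
score_eval = seqio.get_mixture_or_task("bias_fairness_eval_score_eval")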
