Commit

add scripts

nbalepur committed Jan 27, 2024
1 parent 77093ec commit 2336328
Showing 12 changed files with 324 additions and 38 deletions.
1 change: 0 additions & 1 deletion evaluation/plot_accuracy_individual.py
@@ -19,7 +19,6 @@
import pandas as pd
from scipy import stats
import numpy as np
import plotnine
import matplotlib.pyplot as plt

sys.path.append(os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'model'))
72 changes: 39 additions & 33 deletions model/data_loader.py
@@ -1,46 +1,52 @@
from enum import Enum
import numpy as np
from prompt import Normal, MemoriazationNoChoices, MemoriazationRepeatGoldChoices, MemoriazationEmptyChoices, ArtifactChoices, ArtifactChoicesQuestionCOT, TwoChoices, ThreeChoices, ShuffleChoices, ChoiceA, ChoiceB, ChoiceC, ChoiceD, ChoiceAQuestion, ChoiceBQuestion, ChoiceCQuestion, ChoiceDQuestion
import contextlib
from datasets.utils.logging import disable_progress_bar
disable_progress_bar()

class PromptType(Enum):
normal = 'normal'
memorization_no_choice = 'memorization_no_choice'
memorization_gold = 'memorization_gold'
memorization_empty = 'memorization_empty'
artifact_choices = 'artifact_choices'
artifact_choices_cot = 'artifact_choices_cot'
two_choices = 'two_choices'
three_choices = 'three_choices'
shuffle_choices = 'shuffle_choices'
choice_a = 'choice_a'
choice_b = 'choice_b'
choice_c = 'choice_c'
choice_d = 'choice_d'
choice_a_question = 'choice_a_question'
choice_b_question = 'choice_b_question'
choice_c_question = 'choice_c_question'
choice_d_question = 'choice_d_question'
choice_a_even = 'choice_a_even'
choice_b_even = 'choice_b_even'
choice_c_even = 'choice_c_even'
choice_d_even = 'choice_d_even'
choice_a_question_even = 'choice_a_question_even'
choice_b_question_even = 'choice_b_question_even'
choice_c_question_even = 'choice_c_question_even'
choice_d_question_even = 'choice_d_question_even'
artifact_choices_cot_twostep = 'artifact_choices_cot_twostep'
artifact_choices_cot_twostep_generated = 'artifact_choices_cot_twostep_generated'
artifact_choices_cot_twostep_random = 'artifact_choices_cot_twostep_random'
normal = 'normal' # Full MCQA Prompt
artifact_choices = 'artifact_choices' # Choices-Only Prompt

memorization_no_choice = 'memorization_no_choice' # Memorization Prompt - no choices shown
memorization_gold = 'memorization_gold' # Memorization Prompt - all choices are the gold answer
memorization_empty = 'memorization_empty' # Memorization Prompt - all choices are empty

choice_a_even = 'choice_a_even' # Independently classify the correctness of option A, without the question
choice_b_even = 'choice_b_even' # Independently classify the correctness of option B, without the question
choice_c_even = 'choice_c_even' # Independently classify the correctness of option C, without the question
choice_d_even = 'choice_d_even' # Independently classify the correctness of option D, without the question

choice_a_question_even = 'choice_a_question_even' # Independently classify the correctness of option A, with the question
choice_b_question_even = 'choice_b_question_even' # Independently classify the correctness of option B, with the question
choice_c_question_even = 'choice_c_question_even' # Independently classify the correctness of option C, with the question
choice_d_question_even = 'choice_d_question_even' # Independently classify the correctness of option D, with the question

artifact_choices_cot = 'artifact_choices_cot' # Step 1 of Inferring the Question
artifact_choices_cot_twostep_generated = 'artifact_choices_cot_twostep_generated' # Step 2 of Inferring the Question
artifact_choices_cot_twostep_random = 'artifact_choices_cot_twostep_random' # Inferring the Question comparison with Random Question

two_choices = 'two_choices' # 2 Choices out of 4 (not in paper)
three_choices = 'three_choices' # 3 Choices out of 4 (not in paper)
shuffle_choices = 'shuffle_choices' # Shuffle the MC choices (not in paper)

choice_a = 'choice_a' # Independently classify the correctness of option A, without the question (75/25 prior)
choice_b = 'choice_b' # Independently classify the correctness of option B, without the question (75/25 prior)
choice_c = 'choice_c' # Independently classify the correctness of option C, without the question (75/25 prior)
choice_d = 'choice_d' # Independently classify the correctness of option D, without the question (75/25 prior)

choice_a_question = 'choice_a_question' # Independently classify the correctness of option A, with the question (75/25 prior)
choice_b_question = 'choice_b_question' # Independently classify the correctness of option B, with the question (75/25 prior)
choice_c_question = 'choice_c_question' # Independently classify the correctness of option C, with the question (75/25 prior)
choice_d_question = 'choice_d_question' # Independently classify the correctness of option D, with the question (75/25 prior)



class DatasetName(Enum):
mmlu = 'mmlu'
HellaSwag = 'HellaSwag'
ARC = 'ARC'
Winogrande = 'Winogrande'
mmlu = 'mmlu' # MMLU
HellaSwag = 'HellaSwag' # HellaSwag
ARC = 'ARC' # ARC
Winogrande = 'Winogrande' # Winogrande (not in paper)

prompt_type_map = {
PromptType.normal: Normal,
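A note for readers of this diff: each enum value doubles as the string passed on the command line (the experiments lists in the scripts below), and prompt_type_map — truncated above — resolves an enum member to its prompt class from prompt.py. A minimal sketch of that lookup, with stand-in prompt classes and only two map entries shown (illustrative, not the repo's exact code):

from enum import Enum

class PromptType(Enum):
    normal = 'normal'                      # Full MCQA Prompt
    artifact_choices = 'artifact_choices'  # Choices-Only Prompt

# stand-ins for the real prompt classes imported from prompt.py
class Normal: ...
class ArtifactChoices: ...

prompt_type_map = {
    PromptType.normal: Normal,
    PromptType.artifact_choices: ArtifactChoices,
}

def resolve_prompt(name: str):
    # turn a CLI string like 'artifact_choices' into its prompt class;
    # raises ValueError on an unknown prompt type
    return prompt_type_map[PromptType(name)]

print(resolve_prompt('artifact_choices'))  # <class '__main__.ArtifactChoices'>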
39 changes: 39 additions & 0 deletions model/extract_generated_questions.py
@@ -0,0 +1,39 @@
import pickle
import datasets
import sys
import os
sys.path.append(os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'model'))

# specify models, datasets, and the results directory
res_dir = ...
MODELS = ['llama 70b', 'falcon 40b', 'mixtral 7b']
DATASETS = ['ARC', 'MMLU', 'HellaSwag']

pt = 'artifact_choices_cot'

for model_nickname in MODELS:
    for dataset in DATASETS:

        # build the input/output paths without overwriting the base results directory
        res_path = f'{res_dir}{dataset}/{model_nickname}/{pt}.pkl'
        out_path = f'{res_dir}{dataset}/{model_nickname}/gen_question_data.pkl'
        with open(res_path, 'rb') as handle:
            res = pickle.load(handle)

        qs = []
        cs = []
        invalid_count = 0
        for i, r in enumerate(res['raw_text']):
            p = res['prompt'][i]
            if r is not None and 'Answer:' in r:
                # keep the generated question (text before 'Answer:') and its choices
                r_ = r[:r.index('Answer:')].strip()
                qs.append(r_)
                p_ = p.split('\n\n')[-1]
                cs.append(p_.replace('Question:', '').strip())
            else:
                invalid_count += 1
                qs.append(None)
                cs.append(None)

        out = {'questions': qs, 'choices': cs}
        with open(out_path, 'wb') as handle:
            pickle.dump(out, handle, protocol=pickle.HIGHEST_PROTOCOL)
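The pickle written above is a dict with parallel 'questions' and 'choices' lists, holding None where a generation could not be parsed. A quick sanity check for one of these files (the path below is a placeholder following the res_dir/dataset/model layout used above, not an actual repo path):

import pickle

path = '/path/to/results/ARC/llama 70b/gen_question_data.pkl'  # hypothetical path

with open(path, 'rb') as handle:
    data = pickle.load(handle)

questions, choices = data['questions'], data['choices']
valid = sum(q is not None for q in questions)
print(f'{valid}/{len(questions)} generated questions parsed')

# peek at the first successfully parsed pair
for q, c in zip(questions, choices):
    if q is not None:
        print('Inferred question:', q)
        print('Choices block:', c)
        break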
60 changes: 60 additions & 0 deletions model/extract_random_questions.py
@@ -0,0 +1,60 @@
import pickle
import datasets
import copy
import random
import sys
import os
sys.path.append(os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'model'))
from data_loader import create_data_evaluation, DatasetName

# specify models, datasets, and the results directory
res_dir = ...
MODELS = ['llama 70b', 'falcon 40b', 'mixtral 7b']
DATASETS = [DatasetName.ARC, DatasetName.HellaSwag, DatasetName.mmlu]


pt = 'artifact_choices_cot'
ds = datasets.load_dataset('nbalepur/mcqa_artifacts')

def check_any_match(l1, l2):
    # True if any question is still aligned with its original index after shuffling
    for i in range(len(l1)):
        if l1[i] == l2[i]:
            return True
    return False

for dataset_name in DATASETS:

    data = create_data_evaluation(ds, dataset_name)
    qs = data['questions']
    qs_copy = copy.deepcopy(qs)

    # reshuffle until no question stays at its original index,
    # so every item is paired with a different (random) question
    while check_any_match(qs, qs_copy):
        random.shuffle(qs_copy)

    dataset = dataset_name.value

    for model_nickname in MODELS:

        # build the input/output paths without overwriting the base results directory
        res_path = f'{res_dir}{dataset}/{model_nickname}/{pt}.pkl'
        out_path = f'{res_dir}{dataset}/{model_nickname}/random_question_data.pkl'
        with open(res_path, 'rb') as handle:
            res = pickle.load(handle)

        gen_qs = []  # parsed generated questions (kept only for alignment/debugging)
        cs = []
        invalid_count = 0
        for i, r in enumerate(res['raw_text']):
            p = res['prompt'][i]
            if r is not None and 'Answer:' in r:
                r_ = r[:r.index('Answer:')].strip()
                gen_qs.append(r_)
                p_ = p.split('\n\n')[-1]
                cs.append(p_.replace('Question:', '').strip())
            else:
                invalid_count += 1
                gen_qs.append(None)
                cs.append(None)

        # the output pairs the shuffled (random) questions with each item's choices
        out = {'questions': qs_copy, 'choices': cs}
        with open(out_path, 'wb') as handle:
            pickle.dump(out, handle, protocol=pickle.HIGHEST_PROTOCOL)
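The reshuffle loop above keeps permuting qs_copy until no question remains at its original index, so every item receives a random question that is guaranteed to differ from its own. A self-contained illustration of the same idea:

import copy
import random

def check_any_match(l1, l2):
    # True if any element still sits at its original index
    return any(a == b for a, b in zip(l1, l2))

questions = ['q1', 'q2', 'q3', 'q4', 'q5']
shuffled = copy.deepcopy(questions)

# reshuffle until the permutation leaves no question in place
while check_any_match(questions, shuffled):
    random.shuffle(shuffled)

for original, replacement in zip(questions, shuffled):
    assert original != replacement
print(shuffled)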
11 changes: 10 additions & 1 deletion model/run_hf_question_gen.py
@@ -207,7 +207,16 @@ def run_inference(dataset_names, model_name, partition, use_random_question, use
'first_quarter': (0, int(0.25 * len(input_prompts))),
'second_quarter': (int(0.25 * len(input_prompts)), int(0.5 * len(input_prompts))),
'third_quarter': (int(0.5 * len(input_prompts)), int(0.75 * len(input_prompts))),
'fourth_quarter': (int(0.75 * len(input_prompts)), len(input_prompts))}
'fourth_quarter': (int(0.75 * len(input_prompts)), len(input_prompts)),
'first_eighth': (0, int(0.125 * len(input_prompts))),
'second_eighth': (int(0.125 * len(input_prompts)), int(2*0.125 * len(input_prompts))),
'third_eighth': (int(2*0.125 * len(input_prompts)), int(3*0.125 * len(input_prompts))),
'fourth_eighth': (int(3*0.125 * len(input_prompts)), int(4*0.125 * len(input_prompts))),
'fifth_eighth': (int(4*0.125 * len(input_prompts)), int(5*0.125 * len(input_prompts))),
'sixth_eighth': (int(5*0.125 * len(input_prompts)), int(6*0.125 * len(input_prompts))),
'seventh_eighth': (int(6*0.125 * len(input_prompts)), int(7*0.125 * len(input_prompts))),
'eighth_eighth': (int(7*0.125 * len(input_prompts)), len(input_prompts)),
}
start, end = partition_map[partition]

for i in tqdm.tqdm(range(start, end)):
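The hand-written eighth partitions (repeated verbatim in run_hf_question_gen_remote.py and run_hf_remote.py below) all follow one pattern: the k-th slice starts at int(k * 0.125 * len(input_prompts)) and the last slice ends at len(input_prompts). A behavior-equivalent sketch that builds the whole map programmatically — not the repo's code, and it assumes the 'full' and half entries that the scripts' partition comments refer to:

def build_partition_map(n):
    # recreate the 'full'/halves/quarters/eighths partitions over n prompts
    names = {
        2: ['first_half', 'second_half'],
        4: ['first_quarter', 'second_quarter', 'third_quarter', 'fourth_quarter'],
        8: ['first_eighth', 'second_eighth', 'third_eighth', 'fourth_eighth',
            'fifth_eighth', 'sixth_eighth', 'seventh_eighth', 'eighth_eighth'],
    }
    partition_map = {'full': (0, n)}
    for parts, labels in names.items():
        for k, label in enumerate(labels):
            start = int(k / parts * n)
            end = n if k == parts - 1 else int((k + 1) / parts * n)
            partition_map[label] = (start, end)
    return partition_map

print(build_partition_map(1000)['third_eighth'])  # (250, 375)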
11 changes: 10 additions & 1 deletion model/run_hf_question_gen_remote.py
@@ -221,7 +221,16 @@ def run_inference(dataset_names, model_name, partition, use_random_question, use
'first_quarter': (0, int(0.25 * len(input_prompts))),
'second_quarter': (int(0.25 * len(input_prompts)), int(0.5 * len(input_prompts))),
'third_quarter': (int(0.5 * len(input_prompts)), int(0.75 * len(input_prompts))),
'fourth_quarter': (int(0.75 * len(input_prompts)), len(input_prompts))}
'fourth_quarter': (int(0.75 * len(input_prompts)), len(input_prompts)),
'first_eighth': (0, int(0.125 * len(input_prompts))),
'second_eighth': (int(0.125 * len(input_prompts)), int(2*0.125 * len(input_prompts))),
'third_eighth': (int(2*0.125 * len(input_prompts)), int(3*0.125 * len(input_prompts))),
'fourth_eighth': (int(3*0.125 * len(input_prompts)), int(4*0.125 * len(input_prompts))),
'fifth_eighth': (int(4*0.125 * len(input_prompts)), int(5*0.125 * len(input_prompts))),
'sixth_eighth': (int(5*0.125 * len(input_prompts)), int(6*0.125 * len(input_prompts))),
'seventh_eighth': (int(6*0.125 * len(input_prompts)), int(7*0.125 * len(input_prompts))),
'eighth_eighth': (int(7*0.125 * len(input_prompts)), len(input_prompts)),
}
start, end = partition_map[partition]

for i in tqdm.tqdm(range(start, end)):
Expand Down
11 changes: 10 additions & 1 deletion model/run_hf_remote.py
@@ -207,7 +207,16 @@ def run_inference(dataset_names, prompt_types, model_name, partition, use_20_few
'first_quarter': (0, int(0.25 * len(input_prompts))),
'second_quarter': (int(0.25 * len(input_prompts)), int(0.5 * len(input_prompts))),
'third_quarter': (int(0.5 * len(input_prompts)), int(0.75 * len(input_prompts))),
'fourth_quarter': (int(0.75 * len(input_prompts)), len(input_prompts))}
'fourth_quarter': (int(0.75 * len(input_prompts)), len(input_prompts)),
'first_eighth': (0, int(0.125 * len(input_prompts))),
'second_eighth': (int(0.125 * len(input_prompts)), int(2*0.125 * len(input_prompts))),
'third_eighth': (int(2*0.125 * len(input_prompts)), int(3*0.125 * len(input_prompts))),
'fourth_eighth': (int(3*0.125 * len(input_prompts)), int(4*0.125 * len(input_prompts))),
'fifth_eighth': (int(4*0.125 * len(input_prompts)), int(5*0.125 * len(input_prompts))),
'sixth_eighth': (int(5*0.125 * len(input_prompts)), int(6*0.125 * len(input_prompts))),
'seventh_eighth': (int(6*0.125 * len(input_prompts)), int(7*0.125 * len(input_prompts))),
'eighth_eighth': (int(7*0.125 * len(input_prompts)), len(input_prompts)),
}
start, end = partition_map[partition]

for i in tqdm.tqdm(range(start, end)):
1 change: 0 additions & 1 deletion requirements.txt
@@ -3,7 +3,6 @@ huggingface_hub==0.20.1
matplotlib==3.7.2
numpy==1.25.1
pandas==2.0.3
plotnine==0.12.3
scipy==1.12.0
torch==2.0.1
tqdm==4.65.0
38 changes: 38 additions & 0 deletions scripts/model.sh
@@ -0,0 +1,38 @@
#!/bin/bash

model_name="llama 70b" # model nickname (for saving in folders)
model_name_hf="meta-llama/Llama-2-70b-hf" # huggingface directory

# list of experiments
# see all possible experiments in: /mcqa-artifacts/model/data_loader.py
experiments=("normal" "artifact_choices_cot")

# list of datasets to test
# see all possible datasets in: /mcqa-artifacts/model/data_loader.py
datasets=("ARC")

# what partition of the dataset to run
# can be "full" or in halves (e.g. "first_half"), quarters (e.g. "first_quarter"), or eighths (e.g. "first_eighth")
partition="full"

hf_token=... # huggingface token (for downloading gated models)
load_in_8bit="False" # load the model in 8bit? ("False" or "True")
load_in_4bit="False" # load the model in 4bit? ("False" or "True")
use_20_fewshot="False" # use a 20-shot prompt in ARC? ("False" or "True") => we set this to "True" for Falcon



datasets_str=$(IFS=" "; echo "${datasets[*]}")
experiments_str=$(IFS=" "; echo "${experiments[*]}")

# add the correct file below
python3 /mcqa-artifacts/model/run_hf.py \
--model_name="$model_name" \
--model_name_hf="$model_name_hf" \
--dataset_name="$datasets_str" \
--hf_token="$hf_token" \
--load_in_4bit="$load_in_4bit" \
--load_in_8bit="$load_in_8bit" \
--partition="$partition" \
--prompt_types="$experiments_str" \
--use_20_fewshot="$use_20_fewshot"
40 changes: 40 additions & 0 deletions scripts/model_question_gen.sh
@@ -0,0 +1,40 @@
#!/bin/bash

model_name="llama 70b" # model nickname (for saving in folders)
model_name_hf="meta-llama/Llama-2-70b-hf" # huggingface directory

# list of experiments
# see all possible experiments in: /mcqa-artifacts/model/data_loader.py
experiments=("normal" "artifact_choices_cot")

# list of datasets to test
# see all possible datasets in: /mcqa-artifacts/model/data_loader.py
datasets=("ARC")

# what partition of the dataset to run
# can be "full" or in halves (e.g. "first_half"), quarters (e.g. "first_quarter"), or eighths (e.g. "first_eighth")
partition="full"

# Should you use a random question ("True") or a model-generated question ("False")
use_random_question="False"

hf_token=... # huggingface token (for downloading gated models)
load_in_8bit="False" # load the model in 8bit? ("False" or "True")
load_in_4bit="False" # load the model in 4bit? ("False" or "True")
use_20_fewshot="False" # use a 20-shot prompt in ARC? ("False" or "True") => we set this to "True" for Falcon



datasets_str=$(IFS=" "; echo "${datasets[*]}")
experiments_str=$(IFS=" "; echo "${experiments[*]}")

python3 /mcqa-artifacts/model/run_hf_question_gen.py \
--model_name="$model_name" \
--model_name_hf="$model_name_hf" \
--dataset_name="$datasets_str" \
--hf_token="$hf_token" \
--load_in_4bit="$load_in_4bit" \
--load_in_8bit="$load_in_8bit" \
--partition="$partition" \
--use_random_question="$use_random_question" \
--use_20_fewshot="$use_20_fewshot"
40 changes: 40 additions & 0 deletions scripts/model_question_gen_remote.sh
@@ -0,0 +1,40 @@
#!/bin/bash

model_name="llama 70b" # model nickname (for saving in folders)
model_name_hf="meta-llama/Llama-2-70b-hf" # huggingface directory

# list of experiments
# see all possible experiments in: /mcqa-artifacts/model/data_loader.py
experiments=("normal" "artifact_choices_cot")

# list of datasets to test
# see all possible datasets in: /mcqa-artifacts/model/data_loader.py
datasets=("ARC")

# what partition of the dataset to run
# can be "full" or in halves (e.g. "first_half"), quarters (e.g. "first_quarter"), or eighths (e.g. "first_eighth")
partition="full"

# Should you use a random question ("True") or a model-generated question ("False")
use_random_question="False"

hf_token=... # huggingface token (for downloading gated models)
load_in_8bit="False" # load the model in 8bit? ("False" or "True")
load_in_4bit="False" # load the model in 4bit? ("False" or "True")
use_20_fewshot="False" # use a 20-shot prompt in ARC? ("False" or "True") => we set this to "True" for Falcon



datasets_str=$(IFS=" "; echo "${datasets[*]}")
experiments_str=$(IFS=" "; echo "${experiments[*]}")

python3 /mcqa-artifacts/model/run_hf_question_gen_remote.py \
--model_name="$model_name" \
--model_name_hf="$model_name_hf" \
--dataset_name="$datasets_str" \
--hf_token="$hf_token" \
--load_in_4bit="$load_in_4bit" \
--load_in_8bit="$load_in_8bit" \
--partition="$partition" \
--use_random_question="$use_random_question" \
--use_20_fewshot="$use_20_fewshot"

0 comments on commit 2336328
