diff --git a/evaluation/plot_accuracy_individual.py b/evaluation/plot_accuracy_individual.py
index 3b9a821..5fb516f 100644
--- a/evaluation/plot_accuracy_individual.py
+++ b/evaluation/plot_accuracy_individual.py
@@ -19,7 +19,6 @@ import pandas as pd
 from scipy import stats
 import numpy as np
-import plotnine
 import matplotlib.pyplot as plt
 
 sys.path.append(os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'model'))
diff --git a/model/data_loader.py b/model/data_loader.py
index 4c3827f..8442beb 100644
--- a/model/data_loader.py
+++ b/model/data_loader.py
@@ -1,46 +1,52 @@
 from enum import Enum
 import numpy as np
 from prompt import Normal, MemoriazationNoChoices, MemoriazationRepeatGoldChoices, MemoriazationEmptyChoices, ArtifactChoices, ArtifactChoicesQuestionCOT, TwoChoices, ThreeChoices, ShuffleChoices, ChoiceA, ChoiceB, ChoiceC, ChoiceD, ChoiceAQuestion, ChoiceBQuestion, ChoiceCQuestion, ChoiceDQuestion
-import contextlib
 from datasets.utils.logging import disable_progress_bar
 disable_progress_bar()
 
 class PromptType(Enum):
-    normal = 'normal'
-    memorization_no_choice = 'memorization_no_choice'
-    memorization_gold = 'memorization_gold'
-    memorization_empty = 'memorization_empty'
-    artifact_choices = 'artifact_choices'
-    artifact_choices_cot = 'artifact_choices_cot'
-    two_choices = 'two_choices'
-    three_choices = 'three_choices'
-    shuffle_choices = 'shuffle_choices'
-    choice_a = 'choice_a'
-    choice_b = 'choice_b'
-    choice_c = 'choice_c'
-    choice_d = 'choice_d'
-    choice_a_question = 'choice_a_question'
-    choice_b_question = 'choice_b_question'
-    choice_c_question = 'choice_c_question'
-    choice_d_question = 'choice_d_question'
-    choice_a_even = 'choice_a_even'
-    choice_b_even = 'choice_b_even'
-    choice_c_even = 'choice_c_even'
-    choice_d_even = 'choice_d_even'
-    choice_a_question_even = 'choice_a_question_even'
-    choice_b_question_even = 'choice_b_question_even'
-    choice_c_question_even = 'choice_c_question_even'
-    choice_d_question_even = 'choice_d_question_even'
-    artifact_choices_cot_twostep = 'artifact_choices_cot_twostep'
-    artifact_choices_cot_twostep_generated = 'artifact_choices_cot_twostep_generated'
-    artifact_choices_cot_twostep_random = 'artifact_choices_cot_twostep_random'
+    normal = 'normal' # Full MCQA Prompt
+    artifact_choices = 'artifact_choices' # Choices-Only Prompt
+
+    memorization_no_choice = 'memorization_no_choice' # Memorization Prompt - no choices shown
+    memorization_gold = 'memorization_gold' # Memorization Prompt - all choices are the gold answer
+    memorization_empty = 'memorization_empty' # Memorization Prompt - all choices are empty
+
+    choice_a_even = 'choice_a_even' # Independently classify the correctness of option A, without the question
+    choice_b_even = 'choice_b_even' # Independently classify the correctness of option B, without the question
+    choice_c_even = 'choice_c_even' # Independently classify the correctness of option C, without the question
+    choice_d_even = 'choice_d_even' # Independently classify the correctness of option D, without the question
+
+    choice_a_question_even = 'choice_a_question_even' # Independently classify the correctness of option A, with the question
+    choice_b_question_even = 'choice_b_question_even' # Independently classify the correctness of option B, with the question
+    choice_c_question_even = 'choice_c_question_even' # Independently classify the correctness of option C, with the question
+    choice_d_question_even = 'choice_d_question_even' # Independently classify the correctness of option D, with the question
+
+    artifact_choices_cot = 'artifact_choices_cot' # Step 1 of Inferring the Question
+    artifact_choices_cot_twostep_generated = 'artifact_choices_cot_twostep_generated' # Step 2 of Inferring the Question
+    artifact_choices_cot_twostep_random = 'artifact_choices_cot_twostep_random' # Inferring the Question baseline with a random question
+
+    two_choices = 'two_choices' # 2 Choices out of 4 (not in paper)
+    three_choices = 'three_choices' # 3 Choices out of 4 (not in paper)
+    shuffle_choices = 'shuffle_choices' # Shuffle the MC choices (not in paper)
+
+    choice_a = 'choice_a' # Independently classify the correctness of option A, without the question (75/25 prior)
+    choice_b = 'choice_b' # Independently classify the correctness of option B, without the question (75/25 prior)
+    choice_c = 'choice_c' # Independently classify the correctness of option C, without the question (75/25 prior)
+    choice_d = 'choice_d' # Independently classify the correctness of option D, without the question (75/25 prior)
+
+    choice_a_question = 'choice_a_question' # Independently classify the correctness of option A, with the question (75/25 prior)
+    choice_b_question = 'choice_b_question' # Independently classify the correctness of option B, with the question (75/25 prior)
+    choice_c_question = 'choice_c_question' # Independently classify the correctness of option C, with the question (75/25 prior)
+    choice_d_question = 'choice_d_question' # Independently classify the correctness of option D, with the question (75/25 prior)
+
 class DatasetName(Enum):
-    mmlu = 'mmlu'
-    HellaSwag = 'HellaSwag'
-    ARC = 'ARC'
-    Winogrande = 'Winogrande'
+    mmlu = 'mmlu' # MMLU
+    HellaSwag = 'HellaSwag' # HellaSwag
+    ARC = 'ARC' # ARC
+    Winogrande = 'Winogrande' # Winogrande (not in paper)
 
 prompt_type_map = {
     PromptType.normal: Normal,
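The reorganized `PromptType` values are still resolved through `prompt_type_map`. A minimal sketch of that lookup, assuming the remaining entries of `prompt_type_map` mirror the visible `PromptType.normal: Normal` pattern:

from data_loader import PromptType, prompt_type_map

# 'artifact_choices' is the Choices-Only Prompt listed in the enum above
prompt_type = PromptType('artifact_choices')   # enum member from its string value
prompt_cls = prompt_type_map[prompt_type]      # prompt class, e.g. ArtifactChoices
print(prompt_type.name, prompt_cls)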
diff --git a/model/extract_generated_questions.py b/model/extract_generated_questions.py
new file mode 100644
index 0000000..db528e6
--- /dev/null
+++ b/model/extract_generated_questions.py
@@ -0,0 +1,39 @@
+import pickle
+import datasets
+import sys
+import os
+sys.path.append(os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'model'))
+
+# specify models, datasets, and the results directory
+res_dir = ...
+MODELS = ['llama 70b', 'falcon 40b', 'mixtral 7b']
+DATASETS = ['ARC', 'MMLU', 'HellaSwag']
+
+pt = 'artifact_choices_cot'
+
+for model_nickname in MODELS:
+    for dataset in DATASETS:
+
+        res_path = f'{res_dir}{dataset}/{model_nickname}/{pt}.pkl'
+        out_path = f'{res_dir}{dataset}/{model_nickname}/gen_question_data.pkl'
+        with open(res_path, 'rb') as handle:
+            res = pickle.load(handle)
+
+        qs = []
+        cs = []
+        invalid_count = 0
+        for i, r in enumerate(res['raw_text']):
+            p = res['prompt'][i]
+            if r != None and 'Answer:' in r:
+                r_ = r[:r.index('Answer:')].strip()
+                qs.append(r_)
+                p_ = p.split('\n\n')[-1]
+                cs.append(p_.replace('Question:', '').strip())
+            else:
+                invalid_count += 1
+                qs.append(None)
+                cs.append(None)
+
+        out = {'questions': qs, 'choices': cs}
+        with open(out_path, 'wb') as handle:
+            pickle.dump(out, handle, protocol=pickle.HIGHEST_PROTOCOL)
\ No newline at end of file
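The script above writes two parallel lists into `gen_question_data.pkl`; entries are None whenever the generation lacked an 'Answer:' marker. A small sketch of reading the output back (the path is hypothetical; substitute your own `res_dir`):

import pickle

# hypothetical path, following the out_path pattern used above
with open('/path/to/results/ARC/llama 70b/gen_question_data.pkl', 'rb') as handle:
    gen = pickle.load(handle)

print(len(gen['questions']), len(gen['choices']))
print(gen['questions'][0])  # model-inferred question, or None if invalid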
diff --git a/model/extract_random_questions.py b/model/extract_random_questions.py
new file mode 100644
index 0000000..f8bf6c8
--- /dev/null
+++ b/model/extract_random_questions.py
@@ -0,0 +1,60 @@
+import pickle
+import datasets
+import copy
+import random
+import sys
+import os
+sys.path.append(os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'model'))
+from data_loader import create_data_evaluation, DatasetName
+
+# specify models, datasets, and the results directory
+res_dir = ...
+MODELS = ['llama 70b', 'falcon 40b', 'mixtral 7b']
+DATASETS = [DatasetName.ARC, DatasetName.HellaSwag, DatasetName.mmlu]
+
+
+pt = 'artifact_choices_cot'
+ds = datasets.load_dataset('nbalepur/mcqa_artifacts')
+
+def check_any_match(l1, l2):
+    for i in range(len(l1)):
+        if l1[i] == l2[i]:
+            return True
+    return False
+
+for dataset_name in DATASETS:
+
+    data = create_data_evaluation(ds, dataset_name)
+    qs = data['questions']
+    qs_copy = copy.deepcopy(qs)
+
+    while check_any_match(qs, qs_copy):
+        random.shuffle(qs_copy)
+
+    dataset = dataset_name.value
+
+    for model_nickname in MODELS:
+
+        res_path = f'{res_dir}{dataset}/{model_nickname}/{pt}.pkl'
+        out_path = f'{res_dir}{dataset}/{model_nickname}/random_question_data.pkl'
+        with open(res_path, 'rb') as handle:
+            res = pickle.load(handle)
+
+        qs = []
+        cs = []
+        invalid_count = 0
+        for i, r in enumerate(res['raw_text']):
+            p = res['prompt'][i]
+            if r != None and 'Answer:' in r:
+                r_ = r[:r.index('Answer:')].strip()
+                qs.append(r_)
+                p_ = p.split('\n\n')[-1]
+                cs.append(p_.replace('Question:', '').strip())
+            else:
+                invalid_count += 1
+                qs.append(None)
+                cs.append(None)
+
+        out = {'questions': qs_copy, 'choices': cs}
+        with open(out_path, 'wb') as handle:
+            pickle.dump(out, handle, protocol=pickle.HIGHEST_PROTOCOL)
\ No newline at end of file
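The `while check_any_match(...)` loop above reshuffles until no question stays at its original index, i.e. it rejection-samples a derangement so every choice set is paired with a different question. A toy illustration of the same idea:

import copy
import random

def check_any_match(l1, l2):
    return any(a == b for a, b in zip(l1, l2))

qs = ['q1', 'q2', 'q3', 'q4']
qs_copy = copy.deepcopy(qs)
while check_any_match(qs, qs_copy):
    random.shuffle(qs_copy)

# every original question is now paired with some other question's slot
print(list(zip(qs, qs_copy)))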
diff --git a/model/run_hf_question_gen.py b/model/run_hf_question_gen.py
index 8c3a0bb..8e9f337 100644
--- a/model/run_hf_question_gen.py
+++ b/model/run_hf_question_gen.py
@@ -207,7 +207,16 @@ def run_inference(dataset_names, model_name, partition, use_random_question, use
         'first_quarter': (0, int(0.25 * len(input_prompts))),
         'second_quarter': (int(0.25 * len(input_prompts)), int(0.5 * len(input_prompts))),
         'third_quarter': (int(0.5 * len(input_prompts)), int(0.75 * len(input_prompts))),
-        'fourth_quarter': (int(0.75 * len(input_prompts)), len(input_prompts))}
+        'fourth_quarter': (int(0.75 * len(input_prompts)), len(input_prompts)),
+        'first_eighth': (0, int(0.125 * len(input_prompts))),
+        'second_eighth': (int(0.125 * len(input_prompts)), int(2*0.125 * len(input_prompts))),
+        'third_eighth': (int(2*0.125 * len(input_prompts)), int(3*0.125 * len(input_prompts))),
+        'fourth_eighth': (int(3*0.125 * len(input_prompts)), int(4*0.125 * len(input_prompts))),
+        'fifth_eighth': (int(4*0.125 * len(input_prompts)), int(5*0.125 * len(input_prompts))),
+        'sixth_eighth': (int(5*0.125 * len(input_prompts)), int(6*0.125 * len(input_prompts))),
+        'seventh_eighth': (int(6*0.125 * len(input_prompts)), int(7*0.125 * len(input_prompts))),
+        'eighth_eighth': (int(7*0.125 * len(input_prompts)), len(input_prompts)),
+        }
 
     start, end = partition_map[partition]
     for i in tqdm.tqdm(range(start, end)):
diff --git a/model/run_hf_question_gen_remote.py b/model/run_hf_question_gen_remote.py
index 84cb4b3..c1675b6 100644
--- a/model/run_hf_question_gen_remote.py
+++ b/model/run_hf_question_gen_remote.py
@@ -221,7 +221,16 @@ def run_inference(dataset_names, model_name, partition, use_random_question, use
         'first_quarter': (0, int(0.25 * len(input_prompts))),
         'second_quarter': (int(0.25 * len(input_prompts)), int(0.5 * len(input_prompts))),
         'third_quarter': (int(0.5 * len(input_prompts)), int(0.75 * len(input_prompts))),
-        'fourth_quarter': (int(0.75 * len(input_prompts)), len(input_prompts))}
+        'fourth_quarter': (int(0.75 * len(input_prompts)), len(input_prompts)),
+        'first_eighth': (0, int(0.125 * len(input_prompts))),
+        'second_eighth': (int(0.125 * len(input_prompts)), int(2*0.125 * len(input_prompts))),
+        'third_eighth': (int(2*0.125 * len(input_prompts)), int(3*0.125 * len(input_prompts))),
+        'fourth_eighth': (int(3*0.125 * len(input_prompts)), int(4*0.125 * len(input_prompts))),
+        'fifth_eighth': (int(4*0.125 * len(input_prompts)), int(5*0.125 * len(input_prompts))),
+        'sixth_eighth': (int(5*0.125 * len(input_prompts)), int(6*0.125 * len(input_prompts))),
+        'seventh_eighth': (int(6*0.125 * len(input_prompts)), int(7*0.125 * len(input_prompts))),
+        'eighth_eighth': (int(7*0.125 * len(input_prompts)), len(input_prompts)),
+        }
 
     start, end = partition_map[partition]
     for i in tqdm.tqdm(range(start, end)):
diff --git a/model/run_hf_remote.py b/model/run_hf_remote.py
index b3883ed..c29bcc1 100644
--- a/model/run_hf_remote.py
+++ b/model/run_hf_remote.py
@@ -207,7 +207,16 @@ def run_inference(dataset_names, prompt_types, model_name, partition, use_20_few
         'first_quarter': (0, int(0.25 * len(input_prompts))),
         'second_quarter': (int(0.25 * len(input_prompts)), int(0.5 * len(input_prompts))),
         'third_quarter': (int(0.5 * len(input_prompts)), int(0.75 * len(input_prompts))),
-        'fourth_quarter': (int(0.75 * len(input_prompts)), len(input_prompts))}
+        'fourth_quarter': (int(0.75 * len(input_prompts)), len(input_prompts)),
+        'first_eighth': (0, int(0.125 * len(input_prompts))),
+        'second_eighth': (int(0.125 * len(input_prompts)), int(2*0.125 * len(input_prompts))),
+        'third_eighth': (int(2*0.125 * len(input_prompts)), int(3*0.125 * len(input_prompts))),
+        'fourth_eighth': (int(3*0.125 * len(input_prompts)), int(4*0.125 * len(input_prompts))),
+        'fifth_eighth': (int(4*0.125 * len(input_prompts)), int(5*0.125 * len(input_prompts))),
+        'sixth_eighth': (int(5*0.125 * len(input_prompts)), int(6*0.125 * len(input_prompts))),
+        'seventh_eighth': (int(6*0.125 * len(input_prompts)), int(7*0.125 * len(input_prompts))),
+        'eighth_eighth': (int(7*0.125 * len(input_prompts)), len(input_prompts)),
+        }
 
     start, end = partition_map[partition]
     for i in tqdm.tqdm(range(start, end)):
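The new eighth-sized entries follow the same rule as the existing quarters: the k-th of n equal partitions spans [int((k-1)/n * N), int(k/n * N)), with the final slice running to N. A small sketch of that rule (the helper name is illustrative, not part of the diff):

def partition_bounds(n_items, k, n_parts=8):
    """Return (start, end) for the k-th (1-indexed) of n_parts equal slices."""
    start = int((k - 1) / n_parts * n_items)
    end = n_items if k == n_parts else int(k / n_parts * n_items)
    return start, end

# e.g. 'third_eighth' of 1000 prompts
print(partition_bounds(1000, 3))  # (250, 375)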
diff --git a/requirements.txt b/requirements.txt
index 26e86e5..d72b104 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,6 @@ huggingface_hub==0.20.1
 matplotlib==3.7.2
 numpy==1.25.1
 pandas==2.0.3
-plotnine==0.12.3
 scipy==1.12.0
 torch==2.0.1
 tqdm==4.65.0
diff --git a/scripts/model.sh b/scripts/model.sh
new file mode 100644
index 0000000..2923186
--- /dev/null
+++ b/scripts/model.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+model_name="llama 70b" # model nickname (for saving in folders)
+model_name_hf="meta-llama/Llama-2-70b-hf" # huggingface directory
+
+# list of experiments
+# see all possible experiments in: /mcqa-artifacts/model/data_loader.py
+experiments=("normal" "artifact_choices_cot")
+
+# list of datasets to test
+# see all possible datasets in: /mcqa-artifacts/model/data_loader.py
+datasets=("ARC")
+
+# what partition of the dataset to run
+# can be "full" or in halves (e.g. "first_half"), quarters (e.g. "first_quarter"), or eighths (e.g. "first_eighth")
+partition="full"
+
+hf_token=... # huggingface token (for downloading gated models)
+load_in_8bit="False" # load the model in 8bit? ("False" or "True")
+load_in_4bit="False" # load the model in 4bit? ("False" or "True")
+use_20_fewshot="False" # use a 20-shot prompt in ARC? ("False" or "True") => we set this to "True" for Falcon
+
+
+
+datasets_str=$(IFS=" "; echo "${datasets[*]}")
+experiments_str=$(IFS=" "; echo "${experiments[*]}")
+
+# add the correct file below
+python3 /mcqa-artifacts/model/run_hf.py \
+--model_name="$model_name" \
+--model_name_hf="$model_name_hf" \
+--dataset_name="$datasets_str" \
+--hf_token="$hf_token" \
+--load_in_4bit="$load_in_4bit" \
+--load_in_8bit="$load_in_8bit" \
+--partition="$partition" \
+--prompt_types="$experiments_str" \
+--use_20_fewshot="$use_20_fewshot"
("False" or "True") => we set this to "True" for Falcon + + + +datasets_str=$(IFS=" "; echo "${datasets[*]}") +experiments_str=$(IFS=" "; echo "${experiments[*]}") + +python3 /mcqa-artifacts/model/run_hf_question_gen.py \ +--model_name="$model_name" \ +--model_name_hf="$model_name_hf" \ +--dataset_name="$datasets_str" \ +--hf_token="$hf_token" \ +--load_in_4bit="$load_in_4bit" \ +--load_in_8bit="$load_in_8bit" \ +--partition="$partition" \ +--use_random_question="$use_random_question" \ +--use_20_fewshot="$use_20_fewshot" diff --git a/scripts/model_question_gen_remote.sh b/scripts/model_question_gen_remote.sh new file mode 100644 index 0000000..5aeeb48 --- /dev/null +++ b/scripts/model_question_gen_remote.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +model_name="llama 70b" # model nickname (for saving in folders) +model_name_hf="meta-llama/Llama-2-70b-hf" # huggingface directory + +# list of experiments +# see all possible experiments in: /mcqa-artifacts/model/data_loader.py +experiments=("normal" "artifact_choices_cot") + +# list of datasets to test +# see all possible datasets in: /mcqa-artifacts/model/data_loader.py +datasets=("ARC") + +# what partition of the dataset to run +# can be "full" or in halves (e.g. "first_half"), quarters (e.g. "first_quarter"), or eigths (e.g. "first_eighth") +partition="full" + +# Should you use a random question ("True") or a model-generated question ("False") +use_random_question="False" + +hf_token=... # huggingface token (for downloading gated models) +load_in_8bit="False" # load the model in 8bit? ("False" or "True") +load_in_4bit="False" # load the model in 4bit? ("False" or "True") +use_20_fewshot="False" # use a 20-shot prompt in ARC? ("False" or "True") => we set this to "True" for Falcon + + + +datasets_str=$(IFS=" "; echo "${datasets[*]}") +experiments_str=$(IFS=" "; echo "${experiments[*]}") + +python3 /mcqa-artifacts/model/run_hf_question_gen_remote.py \ +--model_name="$model_name" \ +--model_name_hf="$model_name_hf" \ +--dataset_name="$datasets_str" \ +--hf_token="$hf_token" \ +--load_in_4bit="$load_in_4bit" \ +--load_in_8bit="$load_in_8bit" \ +--partition="$partition" \ +--use_random_question="$use_random_question" \ +--use_20_fewshot="$use_20_fewshot" diff --git a/scripts/model_remote.sh b/scripts/model_remote.sh new file mode 100644 index 0000000..5c07261 --- /dev/null +++ b/scripts/model_remote.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +model_name="phi 2" # model nickname (for saving in folders) +model_name_hf="microsoft/phi-2" # huggingface directory + +# list of experiments +# see all possible experiments in: /mcqa-artifacts/model/data_loader.py +experiments=("normal" "artifact_choices_cot") + +# list of datasets to test +# see all possible datasets in: /mcqa-artifacts/model/data_loader.py +datasets=("ARC") + +# what partition of the dataset to run +# can be "full" or in halves (e.g. "first_half"), quarters (e.g. "first_quarter"), or eigths (e.g. "first_eighth") +partition="full" + +hf_token=... # huggingface token (for downloading gated models) +load_in_8bit="False" # load the model in 8bit? ("False" or "True") +load_in_4bit="False" # load the model in 4bit? ("False" or "True") +use_20_fewshot="False" # use a 20-shot prompt in ARC? 
("False" or "True") => we set this to "True" for Falcon + + + +datasets_str=$(IFS=" "; echo "${datasets[*]}") +experiments_str=$(IFS=" "; echo "${experiments[*]}") + +# add the correct file below +python3 /mcqa-artifacts/model/run_hf_remote.py \ +--model_name="$model_name" \ +--model_name_hf="$model_name_hf" \ +--dataset_name="$datasets_str" \ +--hf_token="$hf_token" \ +--load_in_4bit="$load_in_4bit" \ +--load_in_8bit="$load_in_8bit" \ +--partition="$partition" \ +--prompt_types="$experiments_str" \ +--use_20_fewshot="$use_20_fewshot" \ No newline at end of file