Commit

add scripts

nbalepur committed Jan 27, 2024
1 parent 77093ec commit 2336328
Showing 12 changed files with 324 additions and 38 deletions.
1 change: 0 additions & 1 deletion evaluation/plot_accuracy_individual.py
@@ -19,7 +19,6 @@
import pandas as pd
from scipy import stats
import numpy as np
import plotnine
import matplotlib.pyplot as plt

sys.path.append(os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'model'))
72 changes: 39 additions & 33 deletions model/data_loader.py
@@ -1,46 +1,52 @@
from enum import Enum
import numpy as np
from prompt import Normal, MemoriazationNoChoices, MemoriazationRepeatGoldChoices, MemoriazationEmptyChoices, ArtifactChoices, ArtifactChoicesQuestionCOT, TwoChoices, ThreeChoices, ShuffleChoices, ChoiceA, ChoiceB, ChoiceC, ChoiceD, ChoiceAQuestion, ChoiceBQuestion, ChoiceCQuestion, ChoiceDQuestion
import contextlib
from datasets.utils.logging import disable_progress_bar
disable_progress_bar()

class PromptType(Enum):
normal = 'normal'
memorization_no_choice = 'memorization_no_choice'
memorization_gold = 'memorization_gold'
memorization_empty = 'memorization_empty'
artifact_choices = 'artifact_choices'
artifact_choices_cot = 'artifact_choices_cot'
two_choices = 'two_choices'
three_choices = 'three_choices'
shuffle_choices = 'shuffle_choices'
choice_a = 'choice_a'
choice_b = 'choice_b'
choice_c = 'choice_c'
choice_d = 'choice_d'
choice_a_question = 'choice_a_question'
choice_b_question = 'choice_b_question'
choice_c_question = 'choice_c_question'
choice_d_question = 'choice_d_question'
choice_a_even = 'choice_a_even'
choice_b_even = 'choice_b_even'
choice_c_even = 'choice_c_even'
choice_d_even = 'choice_d_even'
choice_a_question_even = 'choice_a_question_even'
choice_b_question_even = 'choice_b_question_even'
choice_c_question_even = 'choice_c_question_even'
choice_d_question_even = 'choice_d_question_even'
artifact_choices_cot_twostep = 'artifact_choices_cot_twostep'
artifact_choices_cot_twostep_generated = 'artifact_choices_cot_twostep_generated'
artifact_choices_cot_twostep_random = 'artifact_choices_cot_twostep_random'
normal = 'normal' # Full MCQA Prompt
artifact_choices = 'artifact_choices' # Choices-Only Prompt

memorization_no_choice = 'memorization_no_choice' # Memorization Prompt - no choices shown
memorization_gold = 'memorization_gold' # Memorization Prompt - all choices are the gold answer
memorization_empty = 'memorization_empty' # Memorization Prompt - all choices are empty

choice_a_even = 'choice_a_even' # Independently classify the correctness of option A, without the question
choice_b_even = 'choice_b_even' # Independently classify the correctness of option B, without the question
choice_c_even = 'choice_c_even' # Independently classify the correctness of option C, without the question
choice_d_even = 'choice_d_even' # Independently classify the correctness of option D, without the question

choice_a_question_even = 'choice_a_question_even' # Independently classify the correctness of option A, with the question
choice_b_question_even = 'choice_b_question_even' # Independently classify the correctness of option B, with the question
choice_c_question_even = 'choice_c_question_even' # Independently classify the correctness of option C, with the question
choice_d_question_even = 'choice_d_question_even' # Independently classify the correctness of option D, with the question

artifact_choices_cot = 'artifact_choices_cot' # Step 1 of Inferring the Question
artifact_choices_cot_twostep_generated = 'artifact_choices_cot_twostep_generated' # Step 2 of Inferring the Question
artifact_choices_cot_twostep_random = 'artifact_choices_cot_twostep_random' # Inferring the Question comparison with Random Question

two_choices = 'two_choices' # 2 Choices out of 4 (not in paper)
three_choices = 'three_choices' # 3 Choices out of 4 (not in paper)
shuffle_choices = 'shuffle_choices' # Shuffle the MC choices (not in paper)

choice_a = 'choice_a' # Independently classify the correctness of option A, without the question (75/25 prior)
choice_b = 'choice_b' # Independently classify the correctness of option B, without the question (75/25 prior)
choice_c = 'choice_c' # Independently classify the correctness of option C, without the question (75/25 prior)
choice_d = 'choice_d' # Independently classify the correctness of option D, without the question (75/25 prior)

choice_a_question = 'choice_a_question' # Independently classify the correctness of option A, with the question (75/25 prior)
choice_b_question = 'choice_b_question' # Independently classify the correctness of option B, with the question (75/25 prior)
choice_c_question = 'choice_c_question' # Independently classify the correctness of option C, with the question (75/25 prior)
choice_d_question = 'choice_d_question' # Independently classify the correctness of option D, with the question (75/25 prior)



class DatasetName(Enum):
mmlu = 'mmlu'
HellaSwag = 'HellaSwag'
ARC = 'ARC'
Winogrande = 'Winogrande'
mmlu = 'mmlu' # MMLU
HellaSwag = 'HellaSwag' # HellaSwag
ARC = 'ARC' # ARC
Winogrande = 'Winogrande' # Winogrande (not in paper)

prompt_type_map = {
PromptType.normal: Normal,
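A note for readers of this diff: each enum value doubles as the string passed on the command line (the experiments lists in the scripts below), and prompt_type_map — truncated above — resolves an enum member to its prompt class from prompt.py. A minimal sketch of that lookup, with stand-in prompt classes and only two map entries shown (illustrative, not the repo's exact code):

from enum import Enum

class PromptType(Enum):
    normal = 'normal'                      # Full MCQA Prompt
    artifact_choices = 'artifact_choices'  # Choices-Only Prompt

# stand-ins for the real prompt classes imported from prompt.py
class Normal: ...
class ArtifactChoices: ...

prompt_type_map = {
    PromptType.normal: Normal,
    PromptType.artifact_choices: ArtifactChoices,
}

def resolve_prompt(name: str):
    # turn a CLI string like 'artifact_choices' into its prompt class;
    # raises ValueError on an unknown prompt type
    return prompt_type_map[PromptType(name)]

print(resolve_prompt('artifact_choices'))  # <class '__main__.ArtifactChoices'>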
39 changes: 39 additions & 0 deletions model/extract_generated_questions.py
@@ -0,0 +1,39 @@
import pickle
import datasets
import sys
import os
sys.path.append(os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'model'))

# specify models, datasets, and the results directory
res_dir = ...
MODELS = ['llama 70b', 'falcon 40b', 'mixtral 7b']
DATASETS = ['ARC', 'MMLU', 'HellaSwag']

pt = 'artifact_choices_cot'

for model_nickname in MODELS:
    for dataset in DATASETS:

        # build the input/output paths without overwriting the base results directory
        res_path = f'{res_dir}{dataset}/{model_nickname}/{pt}.pkl'
        out_path = f'{res_dir}{dataset}/{model_nickname}/gen_question_data.pkl'
        with open(res_path, 'rb') as handle:
            res = pickle.load(handle)

        qs = []
        cs = []
        invalid_count = 0
        for i, r in enumerate(res['raw_text']):
            p = res['prompt'][i]
            if r is not None and 'Answer:' in r:
                # keep the generated question (text before 'Answer:') and its choices
                r_ = r[:r.index('Answer:')].strip()
                qs.append(r_)
                p_ = p.split('\n\n')[-1]
                cs.append(p_.replace('Question:', '').strip())
            else:
                invalid_count += 1
                qs.append(None)
                cs.append(None)

        out = {'questions': qs, 'choices': cs}
        with open(out_path, 'wb') as handle:
            pickle.dump(out, handle, protocol=pickle.HIGHEST_PROTOCOL)
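The pickle written above is a dict with parallel 'questions' and 'choices' lists, holding None where a generation could not be parsed. A quick sanity check for one of these files (the path below is a placeholder following the res_dir/dataset/model layout used above, not an actual repo path):

import pickle

path = '/path/to/results/ARC/llama 70b/gen_question_data.pkl'  # hypothetical path

with open(path, 'rb') as handle:
    data = pickle.load(handle)

questions, choices = data['questions'], data['choices']
valid = sum(q is not None for q in questions)
print(f'{valid}/{len(questions)} generated questions parsed')

# peek at the first successfully parsed pair
for q, c in zip(questions, choices):
    if q is not None:
        print('Inferred question:', q)
        print('Choices block:', c)
        break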
60 changes: 60 additions & 0 deletions model/extract_random_questions.py
@@ -0,0 +1,60 @@
import pickle
import datasets
import copy
import random
import sys
import os
sys.path.append(os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'model'))
from data_loader import create_data_evaluation, DatasetName

# specify models, datasets, and the results directory
res_dir = ...
MODELS = ['llama 70b', 'falcon 40b', 'mixtral 7b']
DATASETS = [DatasetName.ARC, DatasetName.HellaSwag, DatasetName.mmlu]


pt = 'artifact_choices_cot'
ds = datasets.load_dataset('nbalepur/mcqa_artifacts')

def check_any_match(l1, l2):
    # True if any question is still aligned with its original index after shuffling
    for i in range(len(l1)):
        if l1[i] == l2[i]:
            return True
    return False

for dataset_name in DATASETS:

    data = create_data_evaluation(ds, dataset_name)
    qs = data['questions']
    qs_copy = copy.deepcopy(qs)

    # reshuffle until no question stays at its original index,
    # so every item is paired with a different (random) question
    while check_any_match(qs, qs_copy):
        random.shuffle(qs_copy)

    dataset = dataset_name.value

    for model_nickname in MODELS:

        # build the input/output paths without overwriting the base results directory
        res_path = f'{res_dir}{dataset}/{model_nickname}/{pt}.pkl'
        out_path = f'{res_dir}{dataset}/{model_nickname}/random_question_data.pkl'
        with open(res_path, 'rb') as handle:
            res = pickle.load(handle)

        gen_qs = []  # parsed generated questions (kept only for alignment/debugging)
        cs = []
        invalid_count = 0
        for i, r in enumerate(res['raw_text']):
            p = res['prompt'][i]
            if r is not None and 'Answer:' in r:
                r_ = r[:r.index('Answer:')].strip()
                gen_qs.append(r_)
                p_ = p.split('\n\n')[-1]
                cs.append(p_.replace('Question:', '').strip())
            else:
                invalid_count += 1
                gen_qs.append(None)
                cs.append(None)

        # the output pairs the shuffled (random) questions with each item's choices
        out = {'questions': qs_copy, 'choices': cs}
        with open(out_path, 'wb') as handle:
            pickle.dump(out, handle, protocol=pickle.HIGHEST_PROTOCOL)
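The reshuffle loop above keeps permuting qs_copy until no question remains at its original index, so every item receives a random question that is guaranteed to differ from its own. A self-contained illustration of the same idea:

import copy
import random

def check_any_match(l1, l2):
    # True if any element still sits at its original index
    return any(a == b for a, b in zip(l1, l2))

questions = ['q1', 'q2', 'q3', 'q4', 'q5']
shuffled = copy.deepcopy(questions)

# reshuffle until the permutation leaves no question in place
while check_any_match(questions, shuffled):
    random.shuffle(shuffled)

for original, replacement in zip(questions, shuffled):
    assert original != replacement
print(shuffled)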
11 changes: 10 additions & 1 deletion model/run_hf_question_gen.py
@@ -207,7 +207,16 @@ def run_inference(dataset_names, model_name, partition, use_random_question, use
'first_quarter': (0, int(0.25 * len(input_prompts))),
'second_quarter': (int(0.25 * len(input_prompts)), int(0.5 * len(input_prompts))),
'third_quarter': (int(0.5 * len(input_prompts)), int(0.75 * len(input_prompts))),
'fourth_quarter': (int(0.75 * len(input_prompts)), len(input_prompts))}
'fourth_quarter': (int(0.75 * len(input_prompts)), len(input_prompts)),
'first_eighth': (0, int(0.125 * len(input_prompts))),
'second_eighth': (int(0.125 * len(input_prompts)), int(2*0.125 * len(input_prompts))),
'third_eighth': (int(2*0.125 * len(input_prompts)), int(3*0.125 * len(input_prompts))),
'fourth_eighth': (int(3*0.125 * len(input_prompts)), int(4*0.125 * len(input_prompts))),
'fifth_eighth': (int(4*0.125 * len(input_prompts)), int(5*0.125 * len(input_prompts))),
'sixth_eighth': (int(5*0.125 * len(input_prompts)), int(6*0.125 * len(input_prompts))),
'seventh_eighth': (int(6*0.125 * len(input_prompts)), int(7*0.125 * len(input_prompts))),
'eighth_eighth': (int(7*0.125 * len(input_prompts)), len(input_prompts)),
}
start, end = partition_map[partition]

for i in tqdm.tqdm(range(start, end)):
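The hand-written eighth partitions (repeated verbatim in run_hf_question_gen_remote.py and run_hf_remote.py below) all follow one pattern: the k-th slice starts at int(k * 0.125 * len(input_prompts)) and the last slice ends at len(input_prompts). A behavior-equivalent sketch that builds the whole map programmatically — not the repo's code, and it assumes the 'full' and half entries that the scripts' partition comments refer to:

def build_partition_map(n):
    # recreate the 'full'/halves/quarters/eighths partitions over n prompts
    names = {
        2: ['first_half', 'second_half'],
        4: ['first_quarter', 'second_quarter', 'third_quarter', 'fourth_quarter'],
        8: ['first_eighth', 'second_eighth', 'third_eighth', 'fourth_eighth',
            'fifth_eighth', 'sixth_eighth', 'seventh_eighth', 'eighth_eighth'],
    }
    partition_map = {'full': (0, n)}
    for parts, labels in names.items():
        for k, label in enumerate(labels):
            start = int(k / parts * n)
            end = n if k == parts - 1 else int((k + 1) / parts * n)
            partition_map[label] = (start, end)
    return partition_map

print(build_partition_map(1000)['third_eighth'])  # (250, 375)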
11 changes: 10 additions & 1 deletion model/run_hf_question_gen_remote.py
@@ -221,7 +221,16 @@ def run_inference(dataset_names, model_name, partition, use_random_question, use
'first_quarter': (0, int(0.25 * len(input_prompts))),
'second_quarter': (int(0.25 * len(input_prompts)), int(0.5 * len(input_prompts))),
'third_quarter': (int(0.5 * len(input_prompts)), int(0.75 * len(input_prompts))),
'fourth_quarter': (int(0.75 * len(input_prompts)), len(input_prompts))}
'fourth_quarter': (int(0.75 * len(input_prompts)), len(input_prompts)),
'first_eighth': (0, int(0.125 * len(input_prompts))),
'second_eighth': (int(0.125 * len(input_prompts)), int(2*0.125 * len(input_prompts))),
'third_eighth': (int(2*0.125 * len(input_prompts)), int(3*0.125 * len(input_prompts))),
'fourth_eighth': (int(3*0.125 * len(input_prompts)), int(4*0.125 * len(input_prompts))),
'fifth_eighth': (int(4*0.125 * len(input_prompts)), int(5*0.125 * len(input_prompts))),
'sixth_eighth': (int(5*0.125 * len(input_prompts)), int(6*0.125 * len(input_prompts))),
'seventh_eighth': (int(6*0.125 * len(input_prompts)), int(7*0.125 * len(input_prompts))),
'eighth_eighth': (int(7*0.125 * len(input_prompts)), len(input_prompts)),
}
start, end = partition_map[partition]

for i in tqdm.tqdm(range(start, end)):
Expand Down
11 changes: 10 additions & 1 deletion model/run_hf_remote.py
@@ -207,7 +207,16 @@ def run_inference(dataset_names, prompt_types, model_name, partition, use_20_few
'first_quarter': (0, int(0.25 * len(input_prompts))),
'second_quarter': (int(0.25 * len(input_prompts)), int(0.5 * len(input_prompts))),
'third_quarter': (int(0.5 * len(input_prompts)), int(0.75 * len(input_prompts))),
'fourth_quarter': (int(0.75 * len(input_prompts)), len(input_prompts))}
'fourth_quarter': (int(0.75 * len(input_prompts)), len(input_prompts)),
'first_eighth': (0, int(0.125 * len(input_prompts))),
'second_eighth': (int(0.125 * len(input_prompts)), int(2*0.125 * len(input_prompts))),
'third_eighth': (int(2*0.125 * len(input_prompts)), int(3*0.125 * len(input_prompts))),
'fourth_eighth': (int(3*0.125 * len(input_prompts)), int(4*0.125 * len(input_prompts))),
'fifth_eighth': (int(4*0.125 * len(input_prompts)), int(5*0.125 * len(input_prompts))),
'sixth_eighth': (int(5*0.125 * len(input_prompts)), int(6*0.125 * len(input_prompts))),
'seventh_eighth': (int(6*0.125 * len(input_prompts)), int(7*0.125 * len(input_prompts))),
'eighth_eighth': (int(7*0.125 * len(input_prompts)), len(input_prompts)),
}
start, end = partition_map[partition]

for i in tqdm.tqdm(range(start, end)):
1 change: 0 additions & 1 deletion requirements.txt
@@ -3,7 +3,6 @@ huggingface_hub==0.20.1
matplotlib==3.7.2
numpy==1.25.1
pandas==2.0.3
plotnine==0.12.3
scipy==1.12.0
torch==2.0.1
tqdm==4.65.0
38 changes: 38 additions & 0 deletions scripts/model.sh
@@ -0,0 +1,38 @@
#!/bin/bash

model_name="llama 70b" # model nickname (for saving in folders)
model_name_hf="meta-llama/Llama-2-70b-hf" # huggingface directory

# list of experiments
# see all possible experiments in: /mcqa-artifacts/model/data_loader.py
experiments=("normal" "artifact_choices_cot")

# list of datasets to test
# see all possible datasets in: /mcqa-artifacts/model/data_loader.py
datasets=("ARC")

# what partition of the dataset to run
# can be "full" or in halves (e.g. "first_half"), quarters (e.g. "first_quarter"), or eighths (e.g. "first_eighth")
partition="full"

hf_token=... # huggingface token (for downloading gated models)
load_in_8bit="False" # load the model in 8bit? ("False" or "True")
load_in_4bit="False" # load the model in 4bit? ("False" or "True")
use_20_fewshot="False" # use a 20-shot prompt in ARC? ("False" or "True") => we set this to "True" for Falcon



datasets_str=$(IFS=" "; echo "${datasets[*]}")
experiments_str=$(IFS=" "; echo "${experiments[*]}")

# add the correct file below
python3 /mcqa-artifacts/model/run_hf.py \
--model_name="$model_name" \
--model_name_hf="$model_name_hf" \
--dataset_name="$datasets_str" \
--hf_token="$hf_token" \
--load_in_4bit="$load_in_4bit" \
--load_in_8bit="$load_in_8bit" \
--partition="$partition" \
--prompt_types="$experiments_str" \
--use_20_fewshot="$use_20_fewshot"
40 changes: 40 additions & 0 deletions scripts/model_question_gen.sh
@@ -0,0 +1,40 @@
#!/bin/bash

model_name="llama 70b" # model nickname (for saving in folders)
model_name_hf="meta-llama/Llama-2-70b-hf" # huggingface directory

# list of experiments
# see all possible experiments in: /mcqa-artifacts/model/data_loader.py
experiments=("normal" "artifact_choices_cot")

# list of datasets to test
# see all possible datasets in: /mcqa-artifacts/model/data_loader.py
datasets=("ARC")

# what partition of the dataset to run
# can be "full" or in halves (e.g. "first_half"), quarters (e.g. "first_quarter"), or eighths (e.g. "first_eighth")
partition="full"

# Should you use a random question ("True") or a model-generated question ("False")
use_random_question="False"

hf_token=... # huggingface token (for downloading gated models)
load_in_8bit="False" # load the model in 8bit? ("False" or "True")
load_in_4bit="False" # load the model in 4bit? ("False" or "True")
use_20_fewshot="False" # use a 20-shot prompt in ARC? ("False" or "True") => we set this to "True" for Falcon



datasets_str=$(IFS=" "; echo "${datasets[*]}")
experiments_str=$(IFS=" "; echo "${experiments[*]}")

python3 /mcqa-artifacts/model/run_hf_question_gen.py \
--model_name="$model_name" \
--model_name_hf="$model_name_hf" \
--dataset_name="$datasets_str" \
--hf_token="$hf_token" \
--load_in_4bit="$load_in_4bit" \
--load_in_8bit="$load_in_8bit" \
--partition="$partition" \
--use_random_question="$use_random_question" \
--use_20_fewshot="$use_20_fewshot"
40 changes: 40 additions & 0 deletions scripts/model_question_gen_remote.sh
@@ -0,0 +1,40 @@
#!/bin/bash

model_name="llama 70b" # model nickname (for saving in folders)
model_name_hf="meta-llama/Llama-2-70b-hf" # huggingface directory

# list of experiments
# see all possible experiments in: /mcqa-artifacts/model/data_loader.py
experiments=("normal" "artifact_choices_cot")

# list of datasets to test
# see all possible datasets in: /mcqa-artifacts/model/data_loader.py
datasets=("ARC")

# what partition of the dataset to run
# can be "full" or in halves (e.g. "first_half"), quarters (e.g. "first_quarter"), or eighths (e.g. "first_eighth")
partition="full"

# Should you use a random question ("True") or a model-generated question ("False")
use_random_question="False"

hf_token=... # huggingface token (for downloading gated models)
load_in_8bit="False" # load the model in 8bit? ("False" or "True")
load_in_4bit="False" # load the model in 4bit? ("False" or "True")
use_20_fewshot="False" # use a 20-shot prompt in ARC? ("False" or "True") => we set this to "True" for Falcon



datasets_str=$(IFS=" "; echo "${datasets[*]}")
experiments_str=$(IFS=" "; echo "${experiments[*]}")

python3 /mcqa-artifacts/model/run_hf_question_gen_remote.py \
--model_name="$model_name" \
--model_name_hf="$model_name_hf" \
--dataset_name="$datasets_str" \
--hf_token="$hf_token" \
--load_in_4bit="$load_in_4bit" \
--load_in_8bit="$load_in_8bit" \
--partition="$partition" \
--use_random_question="$use_random_question" \
--use_20_fewshot="$use_20_fewshot"

0 comments on commit 2336328
