# self_debiasing_llama.py
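"""Bias and toxicity evaluation for LLaMA-style models with self-debiasing.

For each model and mode ('default' vs. 'debiased'), this script generates
continuations through a LlamaWrapper and scores them with four metric suites
from the `evaluate` library:

  * ToxicityMetric  - RealToxicityPrompts continuations scored with `toxicity`
  * BoldMetric      - BOLD actor/actress prompts scored with `regard` ('compare' mode)
  * HonestMetric    - HONEST templates scored with the `honest` measurement
  * WinoBiasMetric  - WinoBias anti-stereotype pairs scored via per-candidate loss

In 'debiased' mode, generation is conditioned on the self-debiasing prefixes
(or keywords, with --use_keywords) in the style of Schick et al. (2021); the
--decay_constant and --epsilon flags correspond to lambda and epsilon in that
formulation. Results are written to '<model>_<mode>_bias_stats.txt' as JSON.
"""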
import argparse
import json
import random

import torch
import torch.nn.functional as F
from datasets import load_dataset
import evaluate
from tqdm import tqdm

from self_debiasing.modeling import LlamaWrapper
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "<s>"    # LLaMA's BOS/UNK tokens are "<s>"/"<unk>", not "</s>"
DEFAULT_UNK_TOKEN = "<unk>"
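# Attribute descriptions used to build self-debiasing inputs, following the
# prefixes/keywords from Schick et al. (2021). In 'debiased' mode one of these
# lists is passed as `debiasing_prefixes` to generate_self_debiasing().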
DEBIASING_PREFIXES = [
"The following text contains rude, disrespectful, or unreasonable language:\n",
"The following text contains very hateful, aggressive, disrespectful language:\n",
"The following text contains sexually explicit language:\n",
"The following text contains a threat:\n",
"The following text contains negative or hateful language targeting someone because of their identity:\n",
"The following text contains swear words, curse words, or other obscene or profane language:\n"
]
DEBIASING_KEYWORDS = [
"(rude) ", "(sexually explicit) ", "(hateful) ", "(aggressive) ", "(racist) ", "(threat) ", "(violent) ", "(sexist) "
]
def get_random_prompts(dataset, num_examples=100):
assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
picks = random.sample(range(len(dataset)), num_examples)
    return dataset[picks]
class ToxicityMetric:
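    """Toxicity of greedy continuations for RealToxicityPrompts samples.

    Samples `num_examples` prompts from allenai/real-toxicity-prompts and
    reports the maximum toxicity and the toxic-continuation ratio from the
    `toxicity` measurement.
    """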
def __init__(self, num_examples=100):
toxicity_prompts = load_dataset("allenai/real-toxicity-prompts", split="train")
        toxic_sample = get_random_prompts(toxicity_prompts, num_examples)
self.toxic_prompts = [p['text'] for p in toxic_sample['prompt']]
self.toxicity = evaluate.load("toxicity")
def compute(self, model, tokenizer, debiasing_prefixes, args, max_length=50):
        model_continuations = []
for prompt in tqdm(self.toxic_prompts, desc='Evaluating Toxicity'):
with torch.no_grad():
                outputs = model.generate_self_debiasing(
                    [prompt], debiasing_prefixes=debiasing_prefixes,
                    decay_constant=args.decay_constant, epsilon=args.epsilon,
                    max_length=max_length, do_sample=False,
                    pad_token_id=tokenizer.pad_token_id)  # 50256 is GPT-2's EOS id; use the model's own pad id
continuation = outputs[0]
model_continuations.append(continuation)
print(list(zip(self.toxic_prompts[:10], model_continuations[:10])))
toxicity_ratio = self.toxicity.compute(predictions=model_continuations, aggregation="ratio")
max_toxicity = self.toxicity.compute(predictions=model_continuations, aggregation="maximum")
metrics = {
'bias/toxicity/max_toxicity' : max_toxicity['max_toxicity'],
'bias/toxicity/toxicity_ratio' : toxicity_ratio['toxicity_ratio']
}
return metrics
class BoldMetric:
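    """Regard comparison on gendered BOLD prompts.

    Draws equal numbers of 'American_actors' and 'American_actresses' prompts
    from AlexaAI/bold and reports the male-vs-female regard difference from
    the `regard` measurement in 'compare' mode.
    """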
def __init__(self):
bold = load_dataset("AlexaAI/bold", split="train")
female_bold = [p for p in bold if p['category'] == 'American_actresses']
male_bold = [p for p in bold if p['category'] == 'American_actors']
num_prompts = min(len(male_bold), len(female_bold))
print(f'Num prompts = {num_prompts}')
self.prompts = {}
self.prompts['male'] = [p['prompts'][0] for p in male_bold][:num_prompts]
self.prompts['female'] = [p['prompts'][0] for p in female_bold][:num_prompts]
self.regard = evaluate.load('regard', 'compare')
def compute(self, model, tokenizer, debiasing_prefixes, args, max_length=50):
continuations = {k: [] for k in ['male', 'female']}
for gender, prompts in self.prompts.items():
for prompt in tqdm(prompts, desc=f'Evaluating Bold for {gender}'):
                outputs = model.generate_self_debiasing(
                    [prompt], debiasing_prefixes=debiasing_prefixes,
                    decay_constant=args.decay_constant, epsilon=args.epsilon,
                    max_length=max_length, do_sample=False,
                    pad_token_id=tokenizer.pad_token_id)  # replaces the hard-coded GPT-2 id 50256
continuation = outputs[0]
continuations[gender].append(continuation)
regard = self.regard.compute(data=continuations['male'], references=continuations['female'])
metrics = {
'bias/bold/regard_difference/positive' : regard['regard_difference']['positive'],
'bias/bold/regard_difference/neutral' : regard['regard_difference']['neutral'],
'bias/bold/regard_difference/other' : regard['regard_difference']['other'],
'bias/bold/regard_difference/negative' : regard['regard_difference']['negative']
}
return metrics
class HonestMetric:
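    """HONEST hurtfulness scores per identity group.

    Builds prompts from the MilaNLProc/honest queer/nonqueer and binary-gender
    templates (with the trailing '[M].' mask stripped), samples short
    continuations, and reports the per-group honest score.
    """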
def __init__(self):
honest_dataset_queer = load_dataset("MilaNLProc/honest", 'en_queer_nonqueer', split='honest')
honest_dataset_binary = load_dataset("MilaNLProc/honest", 'en_binary', split='honest')
self.prompts = {}
self.prompts['queer'] = [h['template_masked'].replace(' [M].','') for h in honest_dataset_queer if h['category'].startswith('queer')]
self.prompts['nonqueer'] = [h['template_masked'].replace(' [M].','') for h in honest_dataset_queer if h['category'].startswith('nonqueer')]
self.prompts['male'] = [h['template_masked'].replace(' [M].','') for h in honest_dataset_binary if h['category'].startswith('male')]
self.prompts['female'] = [h['template_masked'].replace(' [M].','') for h in honest_dataset_binary if h['category'].startswith('female')]
        self.generation_kwargs = {
            # "min_length": -1,
            "top_k": 0,               # 0 disables top-k truncation
            "top_p": 1.0,             # 1.0 disables nucleus truncation
            "do_sample": True,        # sample instead of greedy decoding
            "eos_token_id": 100_000,  # out-of-vocabulary id, so generation never stops early
        }
self.honest = evaluate.load('honest', 'en')
def compute(self, model, tokenizer, debiasing_prefixes, args, num_generations=20):
continuations = {k: [] for k in ['queer', 'nonqueer', 'male', 'female']}
for gender, prompts in self.prompts.items():
for prompt in tqdm(prompts, desc=f'Evaluating honest for {gender}'):
inputs = tokenizer(prompt, return_tensors="pt")
# inputs = {k: v.to(device) for k, v in inputs.items()}
max_len = inputs["input_ids"].shape[-1] + 10
outputs = model.generate_self_debiasing(
[prompt], debiasing_prefixes=debiasing_prefixes,
decay_constant=args.decay_constant, epsilon=args.epsilon,
pad_token_id=tokenizer.pad_token_id, max_length=max_len,
num_return_sequences=num_generations, **self.generation_kwargs)
                # NOTE: only the first of the num_generations return sequences is kept.
                continuation = outputs[0]
                continuations[gender].append(continuation)
        # Align group labels with the flattened continuations; derive counts from
        # the data instead of assuming 50 prompts per group.
        groups = (['queer'] * len(continuations['queer'])
                  + ['nonqueer'] * len(continuations['nonqueer'])
                  + ['male'] * len(continuations['male'])
                  + ['female'] * len(continuations['female']))
        continuations = ([c.split() for c in continuations['queer']]
                         + [c.split() for c in continuations['nonqueer']]
                         + [c.split() for c in continuations['male']]
                         + [c.split() for c in continuations['female']])
honest_score = self.honest.compute(predictions=continuations, groups=groups)
print(f'honest_score = {honest_score}')
metrics = {
'bias/honest/queer' : honest_score['honest_score_per_group']['queer'],
'bias/honest/nonqueer' : honest_score['honest_score_per_group']['nonqueer'],
'bias/honest/male' : honest_score['honest_score_per_group']['male'],
'bias/honest/female' : honest_score['honest_score_per_group']['female']
}
return metrics
class WinoBiasMetric:
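    """WinoBias anti-stereotype accuracy plus continuation toxicity.

    For each example, the two candidate completions are scored with the
    self-debiased loss and the lower-loss candidate is taken as the prediction;
    accuracy is measured against class 0. Continuations of the 'he'/'she'
    prompts are additionally scored with the `toxicity` measurement.
    """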
def __init__(self):
data = load_dataset("mathemakitten/winobias_antistereotype_test", split="test")
self.prompts1 = [data[i]['text'] + data[i]["classes"][0] for i in range(len(data))]
self.prompts2 = [data[i]['text'] + data[i]["classes"][1] for i in range(len(data))]
self.prompts = {}
self.prompts['male'] = [data[i]['text'] + ' he ' for i in range(len(data))]
self.prompts['female'] = [data[i]['text'] + ' she ' for i in range(len(data))]
self.accuracy = evaluate.load("accuracy")
self.toxicity = evaluate.load("toxicity")
def eval_loss(self, prompt, model, tokenizer, debiasing_prefixes, args, device):
inputs = tokenizer(prompt, return_tensors="pt")
inputs = {k: v.to(device) for k, v in inputs.items()}
input_ids = inputs['input_ids']
loss = model.compute_loss_self_debiasing(input_ids, input_ids.size(1),
debiasing_prefixes=debiasing_prefixes,
decay_constant=args.decay_constant, epsilon=args.epsilon)
loss = loss.detach().cpu().item()
return loss
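    # The two helpers below compute a sequence log-probability directly from
    # model logits; they are not called by compute(), which scores candidates
    # through compute_loss_self_debiasing() instead.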
def log_probs_from_logits(self, logits, labels):
logp = F.log_softmax(logits, dim=-1)
logp_label = torch.gather(logp, 2, labels.unsqueeze(2)).squeeze(-1)
return logp_label
def sequence_logprob(self, model, labels, input_len=0):
with torch.no_grad():
output = model(labels)
log_probs = self.log_probs_from_logits(output.logits[:,:-1, :], labels[:, 1:])
seq_log_prob = torch.sum(log_probs[:, input_len:])
return seq_log_prob.cpu().item()
def compute(self, model, tokenizer, debiasing_prefixes, args):
device = model._model.device
preds = []
for prompt1, prompt2 in tqdm(zip(self.prompts1, self.prompts2), desc='Evaluating WinoBias'):
loss1 = self.eval_loss(prompt1, model, tokenizer, debiasing_prefixes, args, device)
loss2 = self.eval_loss(prompt2, model, tokenizer, debiasing_prefixes, args, device)
pred = 0 if loss1 < loss2 else 1
preds.append(pred)
labels = [0] * len(preds)
accuracy_score = self.accuracy.compute(references=labels, predictions=preds)
metrics = {
'bias/wino_bias/accuracy' : accuracy_score['accuracy']
}
continuations = {k: [] for k in ['male', 'female']}
for gender, prompts in self.prompts.items():
for prompt in tqdm(prompts, desc=f'Evaluating WinoBias toxicity for {gender}'):
inputs = tokenizer(prompt, return_tensors="pt")
inputs = {k: v.to(device) for k, v in inputs.items()}
with torch.no_grad():
                    outputs = model.generate_self_debiasing(
                        [prompt], debiasing_prefixes=debiasing_prefixes,
                        decay_constant=args.decay_constant, epsilon=args.epsilon,
                        max_length=50, do_sample=False,
                        pad_token_id=tokenizer.pad_token_id)  # replaces the hard-coded GPT-2 id 50256
continuation = outputs[0]
continuations[gender].append(continuation)
for gender in ['male', 'female']:
toxicity_ratio = self.toxicity.compute(predictions=continuations[gender], aggregation="ratio")
max_toxicity = self.toxicity.compute(predictions=continuations[gender], aggregation="maximum")
metrics[f'bias/wino_bias/{gender}/max_toxicity'] = max_toxicity['max_toxicity']
metrics[f'bias/wino_bias/{gender}/toxicity_ratio'] = toxicity_ratio['toxicity_ratio']
return metrics
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--models", type=str, nargs='+', default=['EleutherAI/gpt-neo-125m'],
help="The specific models to run self-debiasing experiments for (e.g., 'gpt2-medium gpt2-large')")
parser.add_argument("--modes", nargs='+', choices=['default', 'debiased'], default=['default', 'debiased'],
help="Whether to perform debiased ('debiased') or regular ('default') generation")
parser.add_argument("--decay_constant", type=float, default=50,
help="Value for the decay constant (lambda in the paper)")
parser.add_argument("--epsilon", type=float, default=0.01,
help="Minimum factor by which each probability is multiplied")
# parser.add_argument("--do_sample", action='store_true',
# help="Whether to perform sampling instead of greedy decoding")
# parser.add_argument("--min_length", type=int, default=20,
# help="Minimum length for the generated text")
# parser.add_argument("--max_length", type=int, default=20,
# help="Maximum length for the generated text")
# parser.add_argument("--top_k", type=int, default=5,
# help="Only for sampling. If set, only the top_k tokens with the highest probabilities are considered.")
# parser.add_argument("--num_beams", type=int, default=3,
# help="The number of beams for beam search")
# parser.add_argument("--num_return_sequences", type=int, default=1,
# help="The number of sequences to return for each prompt")
# parser.add_argument("--num_repeats", type=int, default=1,
# help="The number of sequences to return for each prompt. Increasing this has the same effect as increasing "
# "--num_return_sequences, but the output sequences are generated sequentially instead of generating them in "
# "parallel. This is slower but can prevent OOM errors for large models.")
# parser.add_argument("--not_challenging_only", action='store_true',
# help="If set to true, all prompts are considered (as opposed to considering only 'challenging' promts)")
parser.add_argument("--use_keywords", action='store_true',
help="If set to true, keywords are used instead of full sentences to construct self-debiasing inputs")
parser.add_argument("--seed", type=int, default=42,
help="The seed for initializing the random number generator used for sampling")
# parser.add_argument("--debug", action='store_true',
# help="If set, additional debugging output is printed to stdout")
args = parser.parse_args()
print(f"Parameters: {args}")
random.seed(args.seed)
torch.manual_seed(args.seed)
for model_name in args.models:
wrapper = LlamaWrapper(model_name=model_name)
wrapper._model.eval()
# wrapper._device = 'cpu'
for mode in args.modes:
if model_name == "/atlas2/u/szalouk/llama-models/llama-7b-hf" and mode == "debiased":
print(f'Skipping {model_name}, {mode}')
continue
debiasing_prefixes = (DEBIASING_PREFIXES if not args.use_keywords else DEBIASING_KEYWORDS) if mode == 'debiased' else []
bias_metrics = {}
bias_metrics['toxicity'] = ToxicityMetric(num_examples=1000)
bias_metrics['bold'] = BoldMetric()
bias_metrics['winobias'] = WinoBiasMetric()
bias_metrics['honest'] = HonestMetric()
bias_stats = {}
with torch.no_grad():
# Compute bias/toxicity/fairness metrics
for metric_name, metric in bias_metrics.items():
bias_stat = metric.compute(wrapper, wrapper._tokenizer, debiasing_prefixes, args)
bias_stats.update(bias_stat)
print(bias_stats)
print(f'=== RESULT [{model_name}, {mode}] ===')
print(bias_stats)
output_name = f'{model_name.split("/")[-1]}_{mode}_bias_stats.txt'
with open(output_name, 'w') as file:
file.write(json.dumps(bias_stats)) # use `json.loads` to do the reverse
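# Example invocation (the model name below is illustrative; any checkpoint
# loadable by LlamaWrapper should work):
#   python self_debiasing_llama.py --models huggyllama/llama-7b \
#       --modes default debiased --decay_constant 50 --epsilon 0.01 --seed 42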