import json
import os

# Expose all eight GPUs; this must be set before vLLM initializes CUDA,
# and the count must match tensor_parallel_size below.
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3,4,5,6,7'

from vllm import LLM, SamplingParams
def load_file(load_path):
    with open(load_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def save_file(data, save_path):
    with open(save_path, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII dialogue text readable on disk.
        json.dump(data, f, ensure_ascii=False)
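

# Judge model: Meta-Llama-3-70B-Instruct, sharded across all eight GPUs.
# The engine is built once at module level and shared by the three
# evaluation functions below; re-initializing it per metric would be wasteful.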
# model_path = "/data2/sft/InternData/slz/LLaMA-Factory/models/llama3_sft_base_origin_checkpoint-3200"
model_path = "/data2/sft/InternData/slz/Meta-Llama-3-70B-Instruct"
model_name = model_path.split('/')[-1]
save_path = "/data_generation/select_data/apaper_" + model_name
llm = LLM(model=model_path, tensor_parallel_size=8)
tokenizer = llm.get_tokenizer()
def zero_shot_eval_Naturalness(test_data):
    """Zero-shot judge pass: rate each response 1-3 for naturalness."""
    prompts = []
    for i in range(len(test_data)):
        rubric = """You will be given a conversation between two individuals. You will then be given one potential response for the next turn in the conversation. \nYour task is to rate the responses on one metric. \nPlease make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed.\n\nEvaluation Criteria:\nNaturalness (1-3) Is the response naturally written?\n- A score of 1 (bad) means that the response is unnatural.\n- A score of 2 (ok) means the response is strange, but not entirely unnatural.\n- A score of 3 (good) means that the response is natural.\n\nEvaluation Steps:\n1. Read the conversation between the two individuals.\n2. Read the potential response for the next turn in the conversation.\n3. Evaluate the response based on its naturalness, using the provided criteria.\n4. Assign a rating score of 1, 2, or 3 based on the evaluation."""
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": rubric},
            {"role": "user", "content": f"""Conversation History:\n{test_data[i]["instruction"]}\n\nResponse:\n{test_data[i]["output"]}\n\nEvaluation Form (scores ONLY):\n- Naturalness:"""},
        ]
        prompts.append(tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))
    print(prompts[0])
    print('** len: ', len(prompts))
    # Stop on both the tokenizer's eos and Llama-3's end-of-turn token.
    sampling_params = SamplingParams(temperature=0.8, top_p=0.9, max_tokens=128, stop_token_ids=[tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")])
    outputs = llm.generate(prompts, sampling_params)
    # vLLM preserves input order, so output j corresponds to test_data[j].
    for j in range(len(outputs)):
        test_data[j]["Naturalness"] = outputs[j].outputs[0].text
    return test_data
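

# A minimal sketch, not part of the original pipeline: the judge's reply is
# stored verbatim above, so downstream use may need to pull the numeric 1-3
# score out of free-form text. The helper name `extract_score` is hypothetical.
import re

def extract_score(response):
    # Return the first standalone 1, 2, or 3 in the reply, or None if absent.
    match = re.search(r'\b([123])\b', response)
    return int(match.group(1)) if match else None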


def zero_shot_eval_Coherence(test_data):
    """Zero-shot judge pass: rate each response 1-3 for coherence."""
    prompts = []
    for i in range(len(test_data)):
        rubric = """You will be given a conversation between two individuals. You will then be given one potential response for the next turn in the conversation. \nYour task is to rate the responses on one metric. \nPlease make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed.\n\nEvaluation Criteria:\nCoherence (1-3) Does the response serve as a valid continuation of the conversation history?\n- A score of 1 (no) means that the response drastically changes topic or ignores the conversation history.\n- A score of 2 (somewhat) means the response refers to the conversation history in a limited capacity (e.g., in a generic way) and shifts the conversation topic.\n- A score of 3 (yes) means the response is on topic and strongly acknowledges the conversation history.\n\nEvaluation Steps:\n1. Read the conversation history.\n2. Read the potential response.\n3. Evaluate the coherence of the response based on the conversation history.\n4. Assign a score of 1, 2, or 3 for coherence."""
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": rubric},
            {"role": "user", "content": f"""Conversation History:\n{test_data[i]["instruction"]}\n\nResponse:\n{test_data[i]["output"]}\n\nEvaluation Form (scores ONLY):\n- Coherence:"""},
        ]
        prompts.append(tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))
    print(prompts[0])
    print('** len: ', len(prompts))
    sampling_params = SamplingParams(temperature=0.8, top_p=0.9, max_tokens=128, stop_token_ids=[tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")])
    outputs = llm.generate(prompts, sampling_params)
    for j in range(len(outputs)):
        test_data[j]["Coherence"] = outputs[j].outputs[0].text
    return test_data


def zero_shot_eval_Empathy(test_data):
    """Zero-shot judge pass: rate each response 1-3 for empathy."""
    prompts = []
    for i in range(len(test_data)):
        rubric = """You will be given a conversation between two individuals. You will then be given one potential response for the next turn in the conversation. \nYour task is to rate the responses on one metric. \nPlease make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed.\n\nEvaluation Criteria:\nEmpathy (1-3) Does the response demonstrate empathy towards the user's feelings or situation?\n- A score of 1 (bad) means the response lacks empathy and does not acknowledge the user's feelings or situation.\n- A score of 2 (ok) means the response shows some understanding of the user's feelings or situation but could be more empathetic.\n- A score of 3 (good) means the response demonstrates a strong sense of empathy towards the user's feelings or situation and acknowledges them effectively.\n\nEvaluation Steps:\n1. Read the conversation history.\n2. Read the potential response.\n3. Evaluate the empathy of the response based on the conversation history.\n4. Assign a score of 1, 2, or 3 for empathy."""
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": rubric},
            {"role": "user", "content": f"""Conversation History:\n{test_data[i]["instruction"]}\n\nResponse:\n{test_data[i]["output"]}\n\nEvaluation Form (scores ONLY):\n- Empathy:"""},
        ]
        prompts.append(tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))
    print(prompts[0])
    print('** len: ', len(prompts))
    sampling_params = SamplingParams(temperature=0.8, top_p=0.9, max_tokens=128, stop_token_ids=[tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")])
    outputs = llm.generate(prompts, sampling_params)
    for j in range(len(outputs)):
        test_data[j]["Empathy"] = outputs[j].outputs[0].text
    return test_data
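

# A minimal downstream sketch (assumed, not in the original script): average a
# metric across the annotated data, reusing the hypothetical extract_score
# helper defined above.
def mean_score(annotated_data, metric):
    scores = [extract_score(item[metric]) for item in annotated_data]
    scores = [s for s in scores if s is not None]
    return sum(scores) / len(scores) if scores else float('nan')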


if __name__ == '__main__':
    load_path = "/data_generation/filter_kcentergreeedy_embedscore60_llama3_sft_base_origin_checkpoint-3200_score_expandOriginData.json"
    test_data = load_file(load_path)
    print('** len: ', len(test_data))
    print("save_response_path: ", save_path)

    # Run the three judge passes sequentially; each pass annotates test_data
    # in place, and the cumulative annotations are checkpointed to disk.
    Naturalness_data = zero_shot_eval_Naturalness(test_data)
    save_file(Naturalness_data, save_path + "_Naturalness.json")
    print("** done Naturalness!")
    Coherence_data = zero_shot_eval_Coherence(Naturalness_data)
    save_file(Coherence_data, save_path + "_Naturalness_Coherence.json")
    print("** done Naturalness_Coherence!")
    Empathy_data = zero_shot_eval_Empathy(Coherence_data)
    save_file(Empathy_data, save_path + "_Naturalness_Coherence_Empathy.json")
    print("** done!")