"""
k-shot learning on how to classify thuthfuil adn deceptive statements
based on hand-written (now llm generated) clues and reasoning
"""
"""
Inspiration from:
Text Classification via Large Language Models
Xiaofei Sun, Xiaoya Li, Jiwei Li, Fei Wu, Shangwei Guo, Tianwei Zhang, Guoyin Wang
In this paper, we introduce Clue And Reasoning Prompting (CARP).
CARP adopts a progressive reasoning strategy tailored to addressing
the complex linguistic phenomena involved in text classification:
CARP first prompts LLMs to find superficial clues
(e.g., keywords, tones, semantic relations, references, etc),
based on which a diagnostic reasoning process is induced for final decisions.
input: <demo-text-1>
clues: <demo-clues-1>
reasoning: <demo-reason-1>
sentiment: <demo-label-word-1>
input: <demo-text-2>
clues: <demo-clues-2>
reasoning: <demo-reason-2>
sentiment: <demo-label-word-2>
... ...
input: <demo-text-n>
clues: <demo-clues-n>
reasoning: <demo-reason-n>
sentiment: <demo-label-word-n>
input: <text>
clues:
reasoning:
sentiment:
"""
from openai_interface import init_openai, get_chat_completion_with_backoff
from parse_output import extract_classification
from token_count import nb_tokens_in_prompt
from sklearn.metrics import f1_score
import json
import pandas as pd

init_openai()

# df = pd.read_csv('dataset/dataset_plus_filtered_liwc.csv')
df = pd.read_csv('dataset/llm_generated_clues_reasoning_events_data_statements.csv')
# simple EDA
# print(df)
# print(df.columns)
print(f'shape: {df.shape}') # should be 1640 x 6
# print(df.head)
count = 0
for index, row in df.iterrows():
    if row['contains_clues'] == 1:
        count += 1
        print(f"[{row['id'] + 1}], {row['clues']}, {row['reasoning']}")
print(f'Count: {count}')
newline = '\n'
reasoning_word_limit = 100
# the prelude that prefaces every prompt
prelude = f"""
This is an overall classifier for truthful and deceptive statements.
First, present CLUES (i.e., keywords, phrases, contextual information, semantic relations, semantic meaning,
tones, references) that support the classification determination of the input.
Second, deduce a diagnostic REASONING process from premises (i.e., clues, input) that supports the classification
determination (Limit the number of words to {reasoning_word_limit}).
Third, determine the overall CLASSIFICATION of INPUT as Truthful or Deceptive considering CLUES, the REASONING
process and the INPUT.
"""
def create_prelude():
    return prelude + newline
def create_input(row):
    """
    For now, combine question 1 and question 2 into a single input
    """
    return 'INPUT: ' + row['q1'] + '\n' + row['q2']
def create_clues(row):
    """
    use the style from the paper
    """
    # return json.dumps({'CLUES': json.dumps(row['clues'])})
    return json.dumps(json.loads(row['clues']))

def create_reasoning(row):
    return json.dumps(json.loads(row['reasoning']))

def create_classification(row):
    # return dict({'CLASSIFICATION': 'truthful' if row['outcome_class'] == 't' else 'deceptive'})
    return json.dumps(json.loads(row['classification']))
def create_one_of_k_shots(row):
    k_shot = ''
    k_shot += create_input(row) + newline
    k_shot += create_clues(row) + newline
    k_shot += create_reasoning(row) + newline
    k_shot += create_classification(row) + newline
    return k_shot
def setup_inference(row):
    """ We leave CLUES, REASONING, and CLASSIFICATION blank for the LLM to generate """
    inference = ''
    inference += create_input(row) + newline
    inference += json.dumps({"CLUES": ""}) + newline
    inference += json.dumps({"REASONING": ""}) + newline
    inference += json.dumps({"classification": ""}) + newline
    return inference
def construct_context(inference_row, k_shot_count):
    """ Constructs the k shots plus the extra shot that needs to be inferred.
    Which rows are used as shots is determined by the contains_clues flag; this scheme will change in the future...
    Ensure k is even, since we pick k//2 truthful and k//2 deceptive shots for balance.
    """
    context = ''
    # print(create_prelude())
    context += create_prelude()
    # Go through each outcome category separately so the shots stay balanced
    for outcome in ('t', 'd'):
        count = 0
        for index, row in df.iterrows():
            if row['contains_clues'] == 1 and row['outcome_class'] == outcome:
                # print(row['clues'], row['reasoning'])
                context += create_one_of_k_shots(row)
                count += 1
                context += newline
                if count == k_shot_count // 2:
                    break
    context += setup_inference(df.loc[inference_row].copy())
    return context
if __name__ == "__main__":
    k_shot_count = 10  # note: 'k // 2' shots come from each category
    ground_truths = []
    predictions = []
    start_row, end_row = 901, 902
    model = 'gpt-4'  # "gpt-3.5-turbo" or "gpt-4"
    print(f'start row: {start_row} end row: {end_row}')
    print(f'Model:{model}')
    for row in range(start_row, end_row):
        final_context = construct_context(row, k_shot_count)
        ground_truth = 'truthful' if df.loc[row]['outcome_class'] == 't' else 'deceptive'
        print(f"{20*'-'}row: {row} context GT: {ground_truth} {20*'-'}")
        print(final_context)
        print(f"Prompt tokens counted by tiktoken: {nb_tokens_in_prompt(final_context, model)}.")
        response = get_chat_completion_with_backoff(final_context, model=model)
        print(f"{20*'>'}{row} response GT: {ground_truth} {20*'>'}")
        print(response + newline)
        predicted_class = extract_classification(response)
        ground_truths.append(ground_truth)
        predictions.append(predicted_class)
    for ground_truth, prediction in zip(ground_truths, predictions):
        print(f'GT: {ground_truth}, Pred: {prediction}')
    print('Weighted F1-score:', f1_score(ground_truths, predictions, average='weighted'))
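    # A per-class breakdown can sit alongside the weighted F1. This is only a sketch: it
    # assumes the predicted labels are exactly the strings 'truthful' / 'deceptive'
    # (i.e., whatever extract_classification returns is already normalized to those values).
    from sklearn.metrics import confusion_matrix
    labels = ['truthful', 'deceptive']
    print('Confusion matrix (rows = ground truth, cols = prediction):')
    print(confusion_matrix(ground_truths, predictions, labels=labels))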