-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathconvert_to_desc.py
122 lines (100 loc) · 3.73 KB
/
convert_to_desc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
"""
Example Usage:
python evaluation/convert_to_desc.py --data_file <path_to_culturebank_data> --output_file <output_file>
"""
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import re
import json
import argparse
import openai
from openai import OpenAI
from utils.prompt_utils import (
FIELD_DEFINITIONS_SUMMARIZED,
INCONTEXT_DESC,
INCONTEXT_EXP,
DESC_SYSTEM_PROMPT,
DESC_USER_TEMPLATE,
)
from utils.constants import DESC_FIELDS
def _split_into_partitions(df, num_partitions):
    """Split ``df`` row-wise into ``num_partitions`` contiguous, near-equal chunks."""
    # Split the row positions instead of the frame itself: handing a DataFrame
    # straight to np.array_split relies on DataFrame.swapaxes, which newer
    # pandas releases deprecate. The chunk sizes are the same either way.
    position_chunks = np.array_split(np.arange(len(df)), num_partitions)
    return [df.iloc[positions] for positions in position_chunks]


def _describe_with_retries(client, messages, request_kwargs, row_label, attempts=10):
    """Request one description, retrying on any error; return the text or None."""
    for _ in range(attempts):
        try:
            response = client.chat.completions.create(
                messages=messages, **request_kwargs
            )
            return response.choices[0].message.content.strip()
        except Exception as e:
            # Best-effort batch job: any failure (API error, empty reply, ...)
            # is reported and the request is simply attempted again.
            print(f"encountered error at row {row_label}: {e}")
            print("retrying...")
    return None


def main():
    """Turn CultureBank cluster rows into free-text descriptions via the OpenAI chat API.

    Command-line arguments:
        --data_file       CSV to read; must contain ``cluster_id`` and every
                          column named in ``DESC_FIELDS``.
        --output_file     CSV to write, with columns ``cluster_id`` and ``desc``.
        --num_samples     If not -1, randomly sample this many rows (fixed
                          random_state, without replacement) before processing.
        --sanity_check    Parsed for compatibility; not used by this function.
        --num_partitions  Number of chunks the data is split into when
                          ``--partition`` is given (default 4).
        --partition       Zero-based chunk to process; -1 (default) processes
                          every row.

    Each row is sent as one chat request (system prompt, one in-context
    example, then the row's fields as JSON). A request is attempted up to 10
    times; rows that still fail are left out of the output and reported at
    the end of the run.
    """
    parser = argparse.ArgumentParser()
    # Both paths are required: without --output_file, DataFrame.to_csv(None)
    # would return a string and throw away every generated description.
    parser.add_argument("--data_file", type=str, required=True)
    parser.add_argument("--output_file", type=str, required=True)
    parser.add_argument("--num_samples", type=int, default=-1)
    parser.add_argument("--sanity_check", action=argparse.BooleanOptionalAction)
    parser.add_argument("--num_partitions", type=int, default=4)
    parser.add_argument("--partition", type=int, default=-1)
    args = parser.parse_args()

    # Validate explicitly instead of with `assert` (stripped under `python -O`).
    # Negative values other than the -1 sentinel are rejected so they cannot
    # silently index partitions from the end.
    if args.partition != -1:
        if args.num_partitions < 1:
            parser.error("--num_partitions must be at least 1")
        if not 0 <= args.partition < args.num_partitions:
            parser.error(
                f"--partition must be -1 or in [0, {args.num_partitions - 1}], "
                f"got {args.partition}"
            )

    openai.api_key = os.getenv("OPENAI_API_KEY")
    client = OpenAI()

    df = pd.read_csv(args.data_file)

    # Fail fast on a malformed input file rather than failing on every row.
    missing_columns = [
        column for column in ["cluster_id", *DESC_FIELDS] if column not in df.columns
    ]
    if missing_columns:
        parser.error(f"{args.data_file} is missing required columns: {missing_columns}")

    if args.num_samples != -1:
        df = df.sample(n=args.num_samples, replace=False, random_state=1234)

    if args.partition != -1:
        partitions = _split_into_partitions(df, args.num_partitions)
        for i, partition in enumerate(partitions):
            print(f"partition {i}:")
            print(partition.head())
            print()
        df = partitions[args.partition]

    print(f"currently processing {len(df)} clusters")
    print(df.head())

    # Sampling parameters shared by every request.
    request_kwargs = {
        "model": "gpt-3.5-turbo-1106",
        "seed": 1234,
        "temperature": 0.3,
        "max_tokens": 1000,
        "top_p": 0.2,
    }

    # The system prompt and the single in-context example are identical for
    # every row, so they are built once up front.
    system_message = DESC_SYSTEM_PROMPT.format(
        json.dumps(FIELD_DEFINITIONS_SUMMARIZED, indent=4)
    )
    incontext_user_message = DESC_USER_TEMPLATE.format(
        json.dumps(INCONTEXT_EXP, indent=4)
    )
    incontext_assistant_message = INCONTEXT_DESC

    records = []
    skipped_cluster_ids = []
    for idx, row in tqdm(df.iterrows(), total=len(df)):
        # Prompt construction is deterministic, so it lives outside the retry
        # loop: a row that cannot be serialised is skipped once, not ten times.
        try:
            cultural_knowledge = {field: row[field] for field in DESC_FIELDS}
            user_message = DESC_USER_TEMPLATE.format(
                json.dumps(cultural_knowledge, indent=4)
            )
        except (TypeError, ValueError) as e:
            print(f"encountered error at row {idx}: {e}")
            skipped_cluster_ids.append(row["cluster_id"])
            continue

        messages = [
            {"role": "system", "content": system_message},
            {"role": "user", "content": incontext_user_message},
            {"role": "assistant", "content": incontext_assistant_message},
            {"role": "user", "content": user_message},
        ]
        description = _describe_with_retries(client, messages, request_kwargs, idx)
        if description is None:
            print(f"giving up on row {idx} after repeated failures")
            skipped_cluster_ids.append(row["cluster_id"])
            continue

        records.append({"cluster_id": row["cluster_id"], "desc": description})

    df_results = pd.DataFrame.from_records(records, columns=["cluster_id", "desc"])
    df_results.to_csv(args.output_file, index=False)

    if skipped_cluster_ids:
        print(
            f"{len(skipped_cluster_ids)} clusters were not converted: "
            f"{skipped_cluster_ids}"
        )
# Script entry point: run the conversion only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()