forked from SpongebBob/Finetune-ChatGLM2-6B
-
Notifications
You must be signed in to change notification settings - Fork 0
/
convert_to_conv_data.py
45 lines (41 loc) · 1.86 KB
/
convert_to_conv_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import argparse
import json
import datetime
import os
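'''
Convert instruction-style JSONL records into conversation-style JSONL records.

Original record (one JSON object per line of --orig_data):
{"instruction": "题目:小明买了一支钢笔,花费了5元,又买了一本书,花费8元,现在他手里还有10元钱,他手上原来有多少钱?", "input": "", "output": "\n令小明手上原来有的钱为X元。根据题目描述,得出以下方程式:\nX - 5 - 8 = 10\n化简可得:\nX = 23\n因此,小明手上原来有23元钱。"}

Converted record (one JSON object per line of --write_data). The "human" value
is "instruction" followed by "input"; "id" is the record's own "id" when it has
one, otherwise "<dataset_name>-<n>" where n is the 1-based record number:
{
    "id": "<dataset_name>-<n>",
    "conversations": [
        {"from": "human", "value": "题目:小明买了一支钢笔,花费了5元,又买了一本书,花费8元,现在他手里还有10元钱,他手上原来有多少钱?"},
        {"from": "assistant", "value": "\n令小明手上原来有的钱为X元。根据题目描述,得出以下方程式:\nX - 5 - 8 = 10\n化简可得:\nX = 23\n因此,小明手上原来有23元钱。"}
    ]
}
'''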
def _build_item(data, dataset_name, num_id):
    """Convert one instruction record into the conversation format.

    Args:
        data: Parsed JSON object with "instruction" and "output" keys and
            optional "input" and "id" keys.
        dataset_name: Prefix used to synthesize an id when the record has
            no "id" of its own.
        num_id: 1-based position of the record, used as the id suffix.

    Returns:
        A dict of the form {"id": ..., "conversations": [human, assistant]}.

    Raises:
        KeyError: If "instruction" or "output" is missing from the record.
    """
    # A missing or null "input" is treated as empty so that such records
    # still convert instead of aborting the whole run.
    # NOTE: datasets that use "input"/"target" keys instead of
    # "instruction"/"output" need a different mapping here.
    prompt = data["instruction"] + (data.get("input") or "")
    conversations = [
        {"from": "human", "value": prompt},
        {"from": "assistant", "value": data["output"]},
    ]
    if "id" in data:
        uniq_id = data["id"]
    else:
        uniq_id = dataset_name + "-" + str(num_id)
    return {"id": uniq_id, "conversations": conversations}


def main() -> None:
    """Convert a JSONL file of instruction records to conversation records.

    Reads one JSON object per line from --orig_data and writes one converted
    JSON object per line to --write_data. Blank lines in the input are
    skipped. Records without an "id" get "<dataset_name>-<n>", where n is the
    1-based index of the record among the non-blank lines.

    Raises:
        SystemExit: On invalid command-line arguments, or when a record has
            no "id" and --dataset_name was not given.
        json.JSONDecodeError: If a non-blank input line is not valid JSON.
        KeyError: If a record lacks "instruction" or "output".
    """
    parser = argparse.ArgumentParser(
        description="Convert instruction/input/output JSONL to conversation JSONL."
    )
    parser.add_argument(
        "--orig_data",
        required=True,
        help="Path to the input JSONL file (one instruction record per line).",
    )
    parser.add_argument(
        "--write_data",
        required=True,
        help="Path of the output JSONL file to create or overwrite.",
    )
    parser.add_argument(
        "--dataset_name",
        help="Id prefix for records that do not carry their own 'id'.",
    )
    args = parser.parse_args()

    # The input is opened first so a bad input path does not truncate the
    # output; both files use explicit UTF-8 because the data is non-ASCII and
    # is written with ensure_ascii=False.
    with open(args.orig_data, encoding="utf-8") as f_read, \
            open(args.write_data, "w", encoding="utf-8") as f_write:
        num_id = 1
        # Iterate the file lazily rather than loading every line up front.
        for line in f_read:
            if not line.strip():
                continue
            data = json.loads(line)
            if "id" not in data and args.dataset_name is None:
                parser.error(
                    "--dataset_name is required when a record has no 'id' field"
                )
            item = _build_item(data, args.dataset_name, num_id)
            f_write.write(json.dumps(item, ensure_ascii=False) + "\n")
            num_id += 1


if __name__ == "__main__":
    main()