-
Notifications
You must be signed in to change notification settings - Fork 63
/
dataset_info.yaml
142 lines (122 loc) · 2.93 KB
/
dataset_info.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# The dataset_info.yaml file contains the information of the datasets used in the experiments.
# Entries declared with `formatting: alpaca` and no `columns` override.
alpaca:
  hf_hub_url: tatsu-lab/alpaca
  formatting: alpaca

alpaca-clean:
  hf_hub_url: yahma/alpaca-cleaned
  formatting: alpaca

dolly-15k:
  hf_hub_url: databricks/databricks-dolly-15k
  formatting: alpaca

# guanaco is the only entry in this group that also declares `ms_hub_url`.
guanaco:
  hf_hub_url: JosephusCheung/GuanacoDataset
  ms_hub_url: AI-ModelScope/GuanacoDataset
  formatting: alpaca

openassistant-guanaco:
  hf_hub_url: timdettmers/openassistant-guanaco
  formatting: alpaca
# Belle Group: alpaca-formatted entries, each declaring both an
# `hf_hub_url` and an `ms_hub_url`.
belle_0.5m:
  hf_hub_url: BelleGroup/train_0.5M_CN
  ms_hub_url: AI-ModelScope/train_0.5M_CN
  formatting: alpaca

belle_1m:
  hf_hub_url: BelleGroup/train_1M_CN
  ms_hub_url: AI-ModelScope/train_1M_CN
  formatting: alpaca

belle_2m:
  hf_hub_url: BelleGroup/train_2M_CN
  ms_hub_url: AI-ModelScope/train_2M_CN
  formatting: alpaca

belle_dialog:
  hf_hub_url: BelleGroup/generated_chat_0.4M
  ms_hub_url: AI-ModelScope/generated_chat_0.4M
  formatting: alpaca

belle_math:
  hf_hub_url: BelleGroup/school_math_0.25M
  ms_hub_url: AI-ModelScope/school_math_0.25M
  formatting: alpaca
# Belle multi-turn entry; declares no `ms_hub_url`.
# NOTE(review): this is the only `formatting: sharegpt` entry in the file
# that maps `prompt` / `response` / `history`; the other sharegpt entries
# either map `messages` and add a `tags` block or declare no columns at
# all. Confirm the loader accepts this combination.
belle_multiturn:
  hf_hub_url: BelleGroup/multi_turn_0.5M
  formatting: sharegpt
  columns:
    prompt: instruction
    response: output
    history: history
# Firefly: alpaca-formatted entry with a `columns` override.
# Presumably `columns` maps the loader's field name (left) to the dataset's
# own column name (right) -- confirm against the loader.
firefly:
  hf_hub_url: YeungNLP/firefly-train-1.1M
  formatting: alpaca
  columns:
    prompt: input
    response: target
# CodeAlpaca: alpaca-formatted entry declaring both hub ids.
codealpaca:
  hf_hub_url: sahil2801/CodeAlpaca-20k
  ms_hub_url: AI-ModelScope/CodeAlpaca-20k
  formatting: alpaca
# Alpaca-CoT
# No `formatting` key is declared for this entry.
# NOTE(review): presumably the loader falls back to a default format --
# confirm.
alpaca_cot:
  hf_hub_url: QingyiSi/Alpaca-CoT
  ms_hub_url: AI-ModelScope/Alpaca-CoT
# WebQA: alpaca-formatted entry with a `columns` override for `prompt`
# and `response`.
webqa:
  hf_hub_url: suolyer/webqa
  ms_hub_url: AI-ModelScope/webqa
  formatting: alpaca
  columns:
    prompt: input
    response: output
# multi-turn datasets
# sharegpt-formatted entry with no `columns` or `tags` override.
evol_instruct:
  hf_hub_url: MaziyarPanahi/WizardLM_evol_instruct_V2_196k
  ms_hub_url: AI-ModelScope/WizardLM_evol_instruct_V2_196k
  formatting: sharegpt
# sharegpt-formatted entry: `messages` is mapped to the `messages` column
# and a `tags` block is declared.
# Presumably `tags` names the per-message role/content keys and the role
# values for each speaker -- confirm against the loader.
ultrachat_200k:
  hf_hub_url: HuggingFaceH4/ultrachat_200k
  ms_hub_url: AI-ModelScope/ultrachat_200k
  formatting: sharegpt
  columns:
    messages: messages
  tags:
    role_tag: role
    content_tag: content
    user_tag: user
    assistant_tag: assistant
# sharegpt-formatted entry: `messages` is mapped to the `conversation`
# column. Unlike the other `tags` blocks in this file, `user_tag` here is
# `human` rather than `user`.
lmsys_chat:
  hf_hub_url: lmsys/lmsys-chat-1m
  ms_hub_url: AI-ModelScope/lmsys-chat-1m
  formatting: sharegpt
  columns:
    messages: conversation
  tags:
    role_tag: role
    content_tag: content
    user_tag: human
    assistant_tag: assistant
# Declared through `script_url` rather than a hub id, with `ranking: true`
# and `chosen` / `rejected` columns.
# NOTE(review): presumably a pairwise-preference dataset -- confirm how the
# loader interprets `ranking`.
hh_rlhf_en:
  script_url: hh_rlhf_en
  ranking: true
  columns:
    prompt: instruction
    chosen: chosen
    rejected: rejected
    history: history
# `ranking: true` entry; `prompt` is mapped to the `question` column and a
# `system` column is mapped as well.
orca_pairs:
  hf_hub_url: Intel/orca_dpo_pairs
  ranking: true
  columns:
    prompt: question
    chosen: chosen
    rejected: rejected
    system: system
# sharegpt-formatted entry: `messages` is mapped to the `completion` column
# and `kto_tag` to `label`.
# NOTE(review): `kto_tag` is placed under `columns` because it directly
# follows `messages` in the key order; the original indentation was not
# available -- confirm the nesting.
kto_mix_en:
  hf_hub_url: argilla/kto-mix-15k
  formatting: sharegpt
  columns:
    messages: completion
    kto_tag: label
  tags:
    role_tag: role
    content_tag: content
    user_tag: user
    assistant_tag: assistant