# data/dataset_info.yaml

# This file describes the datasets used in the experiments, one top-level key per dataset.
# General instruction datasets. Every entry in this group sets
# `formatting: alpaca` and declares no `columns` mapping.
alpaca:
  hf_hub_url: tatsu-lab/alpaca
  formatting: alpaca

# NOTE(review): this key, `dolly-15k` and `openassistant-guanaco` use hyphens
# while most other keys in this file use underscores. Left unchanged because
# the key is presumably the name callers select the dataset by -- confirm
# before renaming.
alpaca-clean:
  hf_hub_url: yahma/alpaca-cleaned
  formatting: alpaca

dolly-15k:
  hf_hub_url: databricks/databricks-dolly-15k
  formatting: alpaca

guanaco:
  hf_hub_url: JosephusCheung/GuanacoDataset
  # Second source for the same dataset name (presumably a ModelScope mirror).
  ms_hub_url: AI-ModelScope/GuanacoDataset
  formatting: alpaca

openassistant-guanaco:
  hf_hub_url: timdettmers/openassistant-guanaco
  formatting: alpaca

# Belle Group
# BelleGroup datasets declared with `formatting: alpaca` and no column
# remapping. Each entry lists an `hf_hub_url` and an `ms_hub_url` that point
# at the same repository name under different owners.
belle_0.5m:
  hf_hub_url: BelleGroup/train_0.5M_CN
  ms_hub_url: AI-ModelScope/train_0.5M_CN
  formatting: alpaca

belle_1m:
  hf_hub_url: BelleGroup/train_1M_CN
  ms_hub_url: AI-ModelScope/train_1M_CN
  formatting: alpaca

belle_2m:
  hf_hub_url: BelleGroup/train_2M_CN
  ms_hub_url: AI-ModelScope/train_2M_CN
  formatting: alpaca

belle_dialog:
  hf_hub_url: BelleGroup/generated_chat_0.4M
  ms_hub_url: AI-ModelScope/generated_chat_0.4M
  formatting: alpaca

belle_math:
  hf_hub_url: BelleGroup/school_math_0.25M
  ms_hub_url: AI-ModelScope/school_math_0.25M
  formatting: alpaca

# Multi-turn Belle data. The `columns` mapping below uses the
# prompt/response/history field names, which never appear on a
# `formatting: sharegpt` entry elsewhere in this file (those map `messages`
# and add `tags`). The formatting is therefore declared as `alpaca` so that it
# agrees with the mapping, as it does for `firefly` and `webqa`.
belle_multiturn:
  hf_hub_url: BelleGroup/multi_turn_0.5M
  formatting: alpaca
  columns:
    prompt: instruction
    response: output
    history: history

# firefly
# Alpaca-formatted entry with a `columns` override.
firefly:
  hf_hub_url: YeungNLP/firefly-train-1.1M
  formatting: alpaca
  # Presumably maps the loader's field name (left) to the column name used by
  # the dataset (right) -- confirm against the consumer of this file.
  columns:
    prompt: input
    response: target

# CodeAlpaca
# Alpaca-formatted entry listing both hub sources; no column remapping.
codealpaca:
  hf_hub_url: sahil2801/CodeAlpaca-20k
  ms_hub_url: AI-ModelScope/CodeAlpaca-20k
  formatting: alpaca

# Alpaca-CoT
# NOTE(review): unlike the other hub-hosted instruction datasets in this file,
# this entry sets no `formatting` key -- confirm that the loader's default is
# the intended format.
alpaca_cot:
  hf_hub_url: QingyiSi/Alpaca-CoT
  ms_hub_url: AI-ModelScope/Alpaca-CoT

# Alpaca-formatted entry with a `columns` override for prompt and response.
webqa:
  hf_hub_url: suolyer/webqa
  ms_hub_url: AI-ModelScope/webqa
  formatting: alpaca
  # Presumably loader field (left) -> dataset column (right); verify against
  # the consumer of this file.
  columns:
    prompt: input
    response: output

# multi-turn datasets
# Entries declared with `formatting: sharegpt`.
# This one overrides neither `columns` nor `tags`.
evol_instruct:
  hf_hub_url: MaziyarPanahi/WizardLM_evol_instruct_V2_196k
  ms_hub_url: AI-ModelScope/WizardLM_evol_instruct_V2_196k
  formatting: sharegpt

ultrachat_200k:
  hf_hub_url: HuggingFaceH4/ultrachat_200k
  ms_hub_url: AI-ModelScope/ultrachat_200k
  formatting: sharegpt
  columns:
    messages: messages
  # Presumably the key names inside each message (`role_tag`, `content_tag`)
  # and the role values identifying each speaker (`user_tag`,
  # `assistant_tag`) -- confirm against the consumer of this file.
  tags:
    role_tag: role
    content_tag: content
    user_tag: user
    assistant_tag: assistant

lmsys_chat:
  hf_hub_url: lmsys/lmsys-chat-1m
  ms_hub_url: AI-ModelScope/lmsys-chat-1m
  formatting: sharegpt
  # The message list is read from the `conversation` column here, not
  # `messages` as in `ultrachat_200k`.
  columns:
    messages: conversation
  # NOTE(review): `user_tag` is `human` here but `user` in `ultrachat_200k`
  # and `kto_mix_en` -- verify that this matches the role value actually
  # present in the dataset.
  tags:
    role_tag: role
    content_tag: content
    user_tag: human
    assistant_tag: assistant

# Entries flagged `ranking: true`; both map a `chosen` and a `rejected`
# column. Neither sets a `formatting` key.
# This entry is the only one in the file that uses `script_url` instead of a
# hub URL.
hh_rlhf_en:
  script_url: hh_rlhf_en
  ranking: true
  columns:
    prompt: instruction
    chosen: chosen
    rejected: rejected
    history: history

orca_pairs:
  hf_hub_url: Intel/orca_dpo_pairs
  ranking: true
  # Besides the chosen/rejected pair, this entry also maps a `system` column.
  columns:
    prompt: question
    chosen: chosen
    rejected: rejected
    system: system

# Sharegpt-formatted entry; the only one in this file that maps a `kto_tag`
# column.
kto_mix_en:
  hf_hub_url: argilla/kto-mix-15k
  formatting: sharegpt
  columns:
    # The message list is read from the `completion` column.
    messages: completion
    # Presumably the per-example KTO feedback label -- confirm against the
    # consumer of this file.
    kto_tag: label
  tags:
    role_tag: role
    content_tag: content
    user_tag: user
    assistant_tag: assistant