-
Notifications
You must be signed in to change notification settings - Fork 3.3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
3 changed files
with
102 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
# Dataset Card for Yahoo Questions and Answers | ||
|
||
## Dataset Description | ||
|
||
- **Original:** [HuggingFace](https://huggingface.co/datasets/yahoo_answers_qa) | ||
|
||
## Dataset Summary | ||
|
||
The dataset is a collection of Yahoo questions, their topics, and their best answers.
|
||
## Script | ||
|
||
### What it is | ||
|
||
This script is made to convert the original datasets to the expected Open | ||
Assistant datasets format. | ||
|
||
This script can aggregate multiple best answers for one question to create more | ||
unique entries. | ||
|
||
Optionally, it can also generate a toxicity score between 0 and 1 for each question using
[Detoxify](https://github.com/unitaryai/detoxify). | ||
|
||
### What it's not | ||
|
||
This script does not automatically upload any datasets to Hugging Face. It can | ||
only create a JSON file to export the newly generated datasets. | ||
|
||
### Usage | ||
|
||
`python yahoo_qa_conversion.py --help` should do the job 😄 | ||
|
||
## Author | ||
|
||
The script to convert the original datasets was created by | ||
[Shadowner](https://github.com/Shadowner). |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
datasets | ||
detoxify | ||
tqdm |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
import argparse | ||
import json | ||
from typing import Dict, List | ||
|
||
from datasets import Dataset, load_dataset | ||
from detoxify import Detoxify | ||
from tqdm import tqdm | ||
|
||
|
||
def map_data_to_new_format(data: List[Dict], extend: bool, to_take: int, detoxify: bool) -> List[Dict]:
    """Convert Yahoo Q&A records to the Open Assistant INSTRUCTION/RESPONSE format.

    Args:
        data: Records with keys ``question``, ``answer``, ``main_category``
            and ``nbestanswers`` (assumed from the upstream dataset schema —
            TODO confirm against ``yahoo_answers_qa``).
        extend: When True, additionally emit one entry per extra best answer,
            up to ``to_take`` extras per question.
        to_take: Maximum number of additional answers to take per question.
        detoxify: When True, score each question with the Detoxify
            multilingual model (0-1); otherwise ``toxicity`` is ``None``.

    Returns:
        A list of dicts with ``INSTRUCTION``/``RESPONSE``/``SOURCE``/``METADATA`` keys.
    """
    new_data = []
    predictor = Detoxify("multilingual") if detoxify else None
    for d in tqdm(data, desc="Mapping data to new format"):
        # Toxicity depends only on the question, so compute it once per record
        # instead of once per emitted entry — the model call is expensive.
        toxicity = float(predictor.predict(d["question"])["toxicity"]) if detoxify else None

        def make_entry(response: str) -> Dict:
            # All entries derived from this record share question/source/metadata.
            return {
                "INSTRUCTION": d["question"],
                "RESPONSE": response,
                "SOURCE": "Yahoo Q&A",
                "METADATA": {
                    "topic": d["main_category"],
                    "toxicity": toxicity,
                },
            }

        if extend:
            # Answer 0 is assumed to be the primary answer already covered by
            # d["answer"], so the extras start at index 1.
            extra_count = min(len(d["nbestanswers"]) - 1, to_take)
            for i in range(extra_count):
                new_data.append(make_entry(d["nbestanswers"][i + 1]))

        # The primary answer is appended after the extras, matching the
        # original output ordering.
        new_data.append(make_entry(d["answer"]))
    return new_data
|
||
|
||
if __name__ == "__main__":
    # CLI entry point: download the Yahoo Answers QA train split, convert it
    # to the Open Assistant format, and optionally dump it to JSON.
    parser = argparse.ArgumentParser(
        description="Map data from the Yahoo Answers Dataset to the Open Assistant Dataset Format and optionally extend it with the other best answers."
    )
    parser.add_argument("--extend", action="store_true", help="extend dataset by taking more answers per question")
    parser.add_argument("--export-json", action="store_true", help="export the datasets to JSON")
    parser.add_argument(
        "--detoxify",
        action="store_true",
        help="generate a toxicity score between 0-1 using detoxify multilingual model",
    )
    parser.add_argument("--extension-number", type=int, default=1, help="number of answers to take in addition")

    args = parser.parse_args()
    # Train split of the yahoo_answers_qa dataset from the Hugging Face Hub.
    dataset: Dataset = load_dataset("yahoo_answers_qa", split="train")

    new_data: List[Dict] = map_data_to_new_format(dataset, args.extend, args.extension_number, args.detoxify)

    if args.export_json:
        # Explicit utf-8 keeps the output independent of the platform default
        # encoding (e.g. cp1252 on Windows).
        with open("converted.json", "w", encoding="utf-8") as f:
            json.dump(new_data, f, indent=4)