forked from modelscope/data-juicer
-
Notifications
You must be signed in to change notification settings - Fork 0
/
reformat_csv_nan_value.py
85 lines (71 loc) · 2.8 KB
/
reformat_csv_nan_value.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# This tool reformats csv or tsv files that may contain NaN values in some
# fields into jsonl files (one jsonl file per input file).
import os
import pathlib
from multiprocessing import Pool
import fire
from datasets import Dataset
def reformat_nan_value(fp, jsonl_fp, keep_default_na, kwargs=None):
    """
    Convert a single csv/tsv file into a jsonl file.

    The file is loaded with `Dataset.from_csv` and written back out with
    `Dataset.to_json`; nothing is returned.

    :param fp: path to the csv/tsv file to load
    :param jsonl_fp: path to save the jsonl file
    :param keep_default_na: if False, no string will be parsed as NaN,
        otherwise only the default NaN values are used for parsing.
    :param kwargs: dict of extra keyword args forwarded to
        `Dataset.from_csv`, e.g. for a tsv file kwargs["sep"] is '\\t'.
        None is treated as an empty dict.
    :return: None
    """
    # Accept None so callers without extra loader options don't have to
    # build an empty dict (``**None`` would raise a TypeError).
    extra_args = kwargs if kwargs is not None else {}
    ds = Dataset.from_csv(fp, keep_default_na=keep_default_na, **extra_args)
    # force_ascii=False keeps non-ASCII characters unescaped in the output.
    ds.to_json(jsonl_fp, force_ascii=False)
def fp_iter(src_dir, suffix):
    """
    Lazily yield the entries directly inside a directory whose names end
    with the given suffix (non-recursive glob of ``*<suffix>``).

    :param src_dir: path to source dataset directory
    :param suffix: trailing part of the file name to match, e.g. '.csv'
    :return: iterator over the matching `pathlib.Path` objects
    """
    root = pathlib.Path(src_dir)
    yield from root.glob(f'*{suffix}')
def main(src_dir,
         target_dir,
         suffixes=('.csv',),
         is_tsv=False,
         keep_default_na=False,
         num_proc=1,
         **kwargs):
    """
    Reformat csv or tsv files that may contain NaN values, using HuggingFace
    datasets to load them with extra args (e.g. `keep_default_na` set to
    False) and saving each one as a jsonl file.

    :param src_dir: path that stores files named like "*.csv" or "*.tsv".
    :param target_dir: path to store the converted jsonl files. It is
        created if it does not exist.
    :param suffixes: suffix or suffixes of the files to process, multi-suffix
        args look like `--suffixes "'.tsv', '.csv'"`
    :param is_tsv: if True, sep will be set to '\\t'. Default ','.
    :param keep_default_na: if False, no strings will be parsed as NaN,
        otherwise only the default NaN values are used for parsing.
    :param num_proc: number of process workers, Default 1.
    :param kwargs: optional extra args for Dataset loading csv/tsv
    :raises ValueError: if `src_dir` does not exist.
    :raises RuntimeError: if converting one or more files failed; raised
        only after every file has been attempted.
    """
    # check if the source directory exists
    if not os.path.exists(src_dir):
        raise ValueError('The raw source data directory does not exist,'
                         ' Please check and retry.')
    os.makedirs(target_dir, exist_ok=True)

    if is_tsv:
        kwargs['sep'] = '\t'

    # a single suffix may be passed as a plain string
    if isinstance(suffixes, str):
        suffixes = [suffixes]

    failures = []
    # The context manager guarantees the workers are released even if
    # submitting tasks raises part-way through.
    with Pool(num_proc) as pool:
        tasks = []
        for suffix in suffixes:
            for fp in fp_iter(src_dir, suffix):
                # Swap only the trailing suffix; str.replace would also
                # rewrite earlier occurrences, e.g. 'a.csv.bak.csv'.
                stem = fp.name[:len(fp.name) - len(suffix)]
                jsonl_fp = os.path.join(target_dir, stem + '.jsonl')
                result = pool.apply_async(reformat_nan_value,
                                          args=(str(fp), jsonl_fp,
                                                keep_default_na, kwargs))
                tasks.append((str(fp), result))
        pool.close()
        pool.join()

        # apply_async keeps worker exceptions inside the result object;
        # fetch every result so failures are reported instead of lost.
        for path, result in tasks:
            try:
                result.get()
            except Exception as err:
                failures.append((path, err))

    if failures:
        details = '; '.join(f'{path}: {err!r}' for path, err in failures)
        raise RuntimeError(
            f'Failed to convert {len(failures)} file(s): {details}'
        ) from failures[0][1]
# Script entry point: hand `main` to python-fire so it can be driven from
# the command line.
if __name__ == '__main__':
    fire.Fire(main)