forked from modelscope/data-juicer
-
Notifications
You must be signed in to change notification settings - Fork 0
/
redpajama-code.yaml
32 lines (29 loc) · 1.32 KB
/
redpajama-code.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# Process config example for the code data used in RedPajama
# Global parameters

# Project name for this run.
# NOTE(review): 'RedPajam' looks like a misspelling of 'RedPajama'; left
# unchanged because the name may be used by the consumer — confirm.
project_name: 'RedPajam-codes'

# Path to your dataset directory or file (placeholder — replace it).
dataset_path: '/path/to/your/dataset'

# Number of subprocesses used to process your dataset.
np: 4

# Export location (placeholder — replace it); presumably where the processed
# dataset is written, confirm against the consumer.
export_path: '/path/to/result/dataset.jsonl'
# Process schedule
# A list of process operators with their arguments. Each list entry is a
# single-key mapping from the operator name to that operator's arguments, so
# the arguments must be indented underneath their operator; at column 0 they
# would be read as unrelated top-level keys (or fail to parse).
process:
  # No arguments are given for these two operators (their value is null).
  - document_deduplicator:
  - clean_copyright_mapper:

  # Length bounds for the two line-length operators.
  # NOTE(review): units are presumably characters per line — confirm against
  # the operator documentation.
  - maximum_line_length_filter:
      min_len: 1
      max_len: 1000
  - average_line_length_filter:
      min_len: 1
      max_len: 100

  # alphanumeric_filter is listed twice on purpose, with different settings:
  # first without tokenization and a 0.25-1.0 ratio range ...
  - alphanumeric_filter:
      min_ratio: 0.25
      max_ratio: 1.0
  # ... then with tokenization enabled and only a lower bound.
  # NOTE(review): min_ratio is above 1.0 here, so with tokenization the ratio
  # is presumably measured per token rather than per character — confirm.
  - alphanumeric_filter:
      tokenization: true
      min_ratio: 1.5

  # File suffixes (plus the bare names "Dockerfile" and "Makefile") passed to
  # suffix_filter. Entries are case-sensitive strings: ".c"/".C" and ".h"/".H"
  # are distinct items.
  - suffix_filter:
      suffixes: [".asm", ".bat", ".cmd", ".c", ".h", ".cs", ".cpp", ".hpp",
                 ".c++", ".h++", ".cc", ".hh", ".C", ".H", ".cmake", ".css",
                 ".dockerfile", ".f90", ".f", ".f03", ".f08", ".f77", ".f95",
                 ".for", ".fpp", ".go", ".hs", ".html", ".java", ".js",
                 ".jl", ".lua", ".md", ".markdown", ".php", ".php3", ".php4",
                 ".php5", ".phps", ".phpt", ".pl", ".pm", ".pod", ".perl",
                 ".ps1", ".psd1", ".psm1", ".py", ".rb", ".rs", ".sql",
                 ".scala", ".sh", ".bash", ".command", ".zsh", ".ts", ".tsx",
                 ".tex", ".vb", "Dockerfile", "Makefile", ".xml", ".rst",
                 ".m", ".smali"]