# llava-pretrain-refine.yaml
# Source: fork of modelscope/data-juicer
project_name: 'llava-1.5-pretrain-dataset-refine-recipe'

# Converted LLaVA pretrain dataset in Data-Juicer format with
# only_keep_caption set to True. See
# tools/multimodal/source_format_to_data_juicer_format/llava_to_dj.py
dataset_path: 'blip_laion_cc_sbu_558k_dj_fmt_only_caption.jsonl'
export_path: 'blip_laion_cc_sbu_558k_dj_fmt_only_caption_refined.jsonl'

# Number of subprocesses used to process the dataset.
np: 42

# Key name of the field holding the sample texts to be processed,
# e.g. `text`, `instruction`, `output`, ...
text_keys: 'text'

# ---- multimodal data processing ----

# Key name of the field storing the list of sample image paths.
image_key: 'images'

# Special token that represents an image in the text ("<image>" for LLaVA).
# Should be aligned with the args used when running the conversion tools.
image_special_token: '<image>'

# Special token that represents the end of a chunk in the text; the default
# is "<|__dj__eoc|>". A custom token may be specified to suit the input
# dataset. Should be aligned with the args used when running the conversion
# tools.
eoc_special_token: '<|__dj__eoc|>'

open_tracer: true
# process schedule: a list of several process operators with their arguments.
#
# Each list entry is a single-key mapping `operator_name: {arguments}`; the
# arguments MUST be indented deeper than the operator name so that they nest
# under it instead of becoming (duplicate) top-level keys.
#
# NOTE(review): the bare numbers kept in the comments (558128, 558087, ...)
# look like sample counts recorded by the recipe author for each step —
# confirm their exact meaning before relying on them.
process:
  # Mapper ops (no arguments given)
  - fix_unicode_mapper:  # fix unicode errors in text.
  - punctuation_normalization_mapper:  # normalize unicode punctuations to English punctuations.

  # 558128
  # Filter ops

  # filter text with alphabet/numeric ratio out of specific range.
  - alphanumeric_filter:  # 558087
      tokenization: false  # whether to count the ratio of alphanumeric to the total number of tokens.
      min_ratio: 0.60  # the min ratio of filter range

  # filter text with the character repetition ratio out of specific range
  - character_repetition_filter:  # 546105
      rep_len: 10  # repetition length for char-level n-gram
      max_ratio: 0.09373663  # the max ratio of filter range

  # filter text with the flagged-word ratio larger than a specific max value
  - flagged_words_filter:  # 543960
      lang: en  # consider flagged words in what language
      tokenization: false  # whether to use model to tokenize documents
      max_ratio: 0.0  # the max ratio to filter text

  # filter text with perplexity score out of specific range
  - perplexity_filter:  # 532029
      lang: en  # compute perplexity in what language
      max_ppl: 14435.5806  # the max perplexity score to filter text

  # filter text with special-char ratio out of specific range
  - special_characters_filter:  # 531968
      min_ratio: 0.16534802  # the min ratio of filter range
      max_ratio: 0.42023757  # the max ratio of filter range

  # filter text with the word repetition ratio out of specific range
  - word_repetition_filter:  # 530773
      lang: en  # sample in which language
      tokenization: false  # whether to use model to tokenize documents
      rep_len: 10  # repetition length for word-level n-gram
      max_ratio: 0.03085751  # the max ratio of filter range

  # filter samples according to the aspect ratios of images
  # (a fraction of width by height, r=w/h) in them
  - image_aspect_ratio_filter:  # 542389
      min_ratio: 0.333  # the min aspect ratio of filter range
      max_ratio: 3.0  # the max aspect ratio of filter range
      any_or_all: any  # keep this sample when any/all images meet the filter condition

  # filter samples according to the widths and heights of images in them
  - image_shape_filter:  # 533966
      max_width: 727.8798422276  # the max width of width filter range
      max_height: 606.2421072264  # the max height of height filter range
      any_or_all: any  # keep this sample when any/all images meet the filter condition

  # filter samples according to the size of images (in bytes) within them
  - image_size_filter:  # 533966
      max_size: "124KB"  # the max size of filter range
      any_or_all: any  # keep this sample when any/all images meet the filter condition

  # filter samples according to the similarity between text and images.
  - image_text_similarity_filter:  # 544202
      hf_clip: openai/clip-vit-base-patch32  # name of used Hugging Face clip
      min_score: 0.20315419  # the min similarity of filter range

  # filter samples according to the matching score between image and text.
  - image_text_matching_filter:
      hf_blip: Salesforce/blip-itm-base-coco  # name of used Hugging Face blip
      min_score: 0.44930778  # the min matching score of filter range