diff --git a/configs/cc-news/dedupe_by_year.sh b/configs/cc-news/dedupe_by_year.sh
index 66c7993c..321af0b4 100644
--- a/configs/cc-news/dedupe_by_year.sh
+++ b/configs/cc-news/dedupe_by_year.sh
@@ -1,64 +1,19 @@
 #! /usr/bin/env bash
 
-# documents:
-#   - s3://ai2-llm/pretraining-data/sources/c4/v0/documents/train/*.gz
-
-# dedupe:
-#   name: dedupe_para_ngrams_13_1
-#   paragraphs:
-#     attribute_name: dedupe_para_ngrams_13_1
-#     by_ngram:
-#       ngram_length: 13
-#       stride: 1
-#       overlap_threshold: 0.5
-#     skip_empty: true
-
-# bloom_filter:
-#   file: ${oc.env:HOME}/c4_dedupe_para_ngrams_13_1.bin
-#   read_only: false
-#   # estimated doc count is obtained by counting number of words in paragraphs
-#   # then dividing by 13 (ngram_length) and multiplying by 2 (for each ngram)
-#   estimated_doc_count: 359_916_731_334
-#   desired_false_positive_rate: 0.1
-
-# processes: 188
-# work_dir:
-#   input: /tmp/c4_dedupe_para_ngrams_13_1/input
-#   output: /tmp/c4_dedupe_para_ngrams_13_1/output
+base_dir="${HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents"
 
 # run years between 2016 and 2024
 for year in {2016..2024}; do
-    # Initialize an empty array to store document paths and a variable for total size
     documents=()
     size=0
-
-    # Collect all month document paths into the array and accumulate size
-    for month in {1..12}; do
-        # Skip months after 7 if year is 2024
-        if [ $year -eq 2024 ] && [ $month -gt 7 ]; then
-            continue
-        fi
-
-        # Skip months before 8 if year is 2016
-        if [ $year -eq 2016 ] && [ $month -lt 8 ]; then
-            continue
-        fi
-
-        # Format month as 2 digits
-        month=$(printf "%02d" $month)
-
-        # Add the document path for this month to the array
-        documents+=("s3://ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse/documents/${year}-${month}/*.zst")
-
-        # Get the size for this month and add it to the total size
-        month_size=$(aws s3api list-objects --bucket ai2-llm --prefix "pretraining-data/sources/cc-news/v1-resiliparse/documents/${year}-${month}/" --output json --query "[sum(Contents[].Size)]" | jq '.[0]' -rc)
-        size=$((size + month_size))
-    done
-
+    # Collect every document file for the year and accumulate its size in bytes
+    while IFS= read -r -d '' file; do
+        documents+=("$file")
+        size=$((size + $(stat -c %s "$file")))
+    done < <(find "${base_dir}/${year}" -type f \( -name "*.zst" -o -name "*.gz" -o -name "*.gzip" -o -name "*.json" -o -name "*.jsonl" \) -print0)
 
     # run deduplication
-    echo "Running fuzzy dedupe for ${year} with ${size} bytes Bloom filter"
+    echo "Running fuzzy dedupe for ${year} with ${size} bytes for the Bloom filter (files: ${#documents[@]})"
 
     # Start the output
    document_linearized="documents:\n"
@@ -85,11 +40,11 @@
 bloom_filter:
   file: /tmp/cc_news_${year}_dedupe_ngram.bin
   read_only: false
   estimated_doc_count: ${size}
-  desired_false_positive_rate: 0.01
+  desired_false_positive_rate: 0.1
 
 work_dir:
-  input: /tmp/cc_news_${year}_dedupe_para_ngrams_13_1/input
-  output: /tmp/cc_news_${year}_dedupe_para_ngrams_13_1/output
+  input: /tmp/cc_news_${year}_dedupe_ngrams_20_1/input
+  output: /tmp/cc_news_${year}_dedupe_ngrams_20_1/output
 EOF
 )
@@ -106,7 +61,9 @@
     dolma -c "$temp_config_file" dedupe --processes $(expr $(nproc) - 4)
     set +ex
 
+    # Remove the temporary config and the per-year Bloom filter / work dirs
     rm "$temp_config_file"
+    rm -rf "/tmp/cc_news_${year}"*
 
 done
 
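Note: the size accumulation above forks `stat` once per file. As a quick standalone sanity check (not part of this patch), the same per-year totals can be computed in a single pass with GNU coreutils: `du --files0-from=-` reads the NUL-delimited list produced by the identical `find` invocation, `-b` counts apparent size in bytes (matching `stat -c %s`), and `-c` appends a grand total. The `year=2016` value is only an example.

    year=2016
    base_dir="${HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents"
    # Total bytes for one year without a per-file stat call; the last line is the grand total
    find "${base_dir}/${year}" -type f \( -name "*.zst" -o -name "*.gz" -o -name "*.gzip" -o -name "*.json" -o -name "*.jsonl" \) -print0 \
        | du --files0-from=- -cb | tail -n 1
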
diff --git a/configs/cc-news/mix-deupe-by-year.yaml b/configs/cc-news/mix-deupe-by-year.yaml
new file mode 100644
index 00000000..85856a82
--- /dev/null
+++ b/configs/cc-news/mix-deupe-by-year.yaml
@@ -0,0 +1,78 @@
+streams:
+  - name: cc-news_2016
+    documents:
+      - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2016/*.json.zst
+    attributes: &attributes
+      - dedupe_ngrams_20_1
+    output: &output
+      max_size_in_bytes: 2_500_000_000
+      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year-dedupe/documents
+    filter: &filter
+      include:
+        - >-
+          (.attributes.dedupe_ngrams_20_1 | length == 0) or
+          ((.attributes.dedupe_ngrams_20_1 | map(.[2] * (.[1] - .[0])) | add) / (.text | length) <= 0.3)
+      syntax: jq
+
+  - name: cc-news_2017
+    documents:
+      - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2017/*.json.zst
+    attributes: *attributes
+    output: *output
+    filter: *filter
+
+  - name: cc-news_2018
+    documents:
+      - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2018/*.json.zst
+    attributes: *attributes
+    output: *output
+    filter: *filter
+
+  - name: cc-news_2019
+    documents:
+      - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2019/*.json.zst
+    attributes: *attributes
+    output: *output
+    filter: *filter
+
+  - name: cc-news_2020
+    documents:
+      - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2020/*.json.zst
+    attributes: *attributes
+    output: *output
+    filter: *filter
+
+  - name: cc-news_2021
+    documents:
+      - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2021/*.json.zst
+    attributes: *attributes
+    output: *output
+    filter: *filter
+
+  - name: cc-news_2022
+    documents:
+      - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2022/*.json.zst
+    attributes: *attributes
+    output: *output
+    filter: *filter
+
+  - name: cc-news_2023
+    documents:
+      - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2023/*.json.zst
+    attributes: *attributes
+    output: *output
+    filter: *filter
+
+  - name: cc-news_2024
+    documents:
+      - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2024/*.json.zst
+    attributes: *attributes
+    output: *output
+    filter: *filter
+
+
+work_dir:
+  input: ${oc.env:HOME}/ai2-llm/work_dir/cc-news/v1-resiliparse-year/input
+  output: ${oc.env:HOME}/ai2-llm/work_dir/cc-news/v1-resiliparse-year/output
+
+processes: 188
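Note: the `include` filter keeps a document when it has no `dedupe_ngrams_20_1` spans, or when the score-weighted fraction of its text covered by duplicate spans is at most 30%. A minimal standalone check of that expression with plain `jq` (not part of this patch), assuming from the arithmetic that each attribute span is laid out as `[start, end, score]` and using a made-up ten-character document:

    # Span [0, 5, 1.0] marks 5 of 10 characters as duplicated (50% > 30%), so this prints "false"
    echo '{"text":"0123456789","attributes":{"dedupe_ngrams_20_1":[[0,5,1.0]]}}' \
        | jq '(.attributes.dedupe_ngrams_20_1 | length == 0) or
              ((.attributes.dedupe_ngrams_20_1 | map(.[2] * (.[1] - .[0])) | add) / (.text | length) <= 0.3)'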