Skip to content

Commit

Permalink
mix
Browse files — browse the repository at this point in the history
  • Loading branch information
soldni committed Dec 30, 2024
1 parent c944b6a commit 2b6b5f7
Show file tree
Hide file tree
Showing 2 changed files with 89 additions and 54 deletions.
65 changes: 11 additions & 54 deletions configs/cc-news/dedupe_by_year.sh
Original file line number Diff line number Diff line change
@@ -1,64 +1,19 @@
#! /usr/bin/env bash

# documents:
# - s3://ai2-llm/pretraining-data/sources/c4/v0/documents/train/*.gz

# dedupe:
# name: dedupe_para_ngrams_13_1
# paragraphs:
# attribute_name: dedupe_para_ngrams_13_1
# by_ngram:
# ngram_length: 13
# stride: 1
# overlap_threshold: 0.5
# skip_empty: true

# bloom_filter:
# file: ${oc.env:HOME}/c4_dedupe_para_ngrams_13_1.bin
# read_only: false
# # estimated doc count is obtained by counting number of words in paragraphs
# # then dividing by 13 (ngram_length) and multiplying by 2 (for each ngram)
# estimated_doc_count: 359_916_731_334
# desired_false_positive_rate: 0.1

# processes: 188
# work_dir:
# input: /tmp/c4_dedupe_para_ngrams_13_1/input
# output: /tmp/c4_dedupe_para_ngrams_13_1/output
base_dir="${HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents"

# run years between 2016 and 2024
for year in {2016..2024}; do

# Initialize an empty array to store document paths and a variable for total size
documents=()
size=0

# NOTE(review): this S3-based month loop appears to be superseded diff residue —
# the local find-based loop later in the script also populates `documents` and
# `size`; confirm against the committed file whether both collectors should run.
# Collect all month document paths into the array and accumulate size
for month in {1..12}; do
# Skip months after 7 if year is 2024
# (CC-News coverage for 2024 presumably ends at July — TODO confirm)
if [ $year -eq 2024 ] && [ $month -gt 7 ]; then
continue
fi

# Skip months before 8 if year is 2016
# (CC-News coverage presumably starts August 2016 — TODO confirm)
if [ $year -eq 2016 ] && [ $month -lt 8 ]; then
continue
fi

# Format month as 2 digits
month=$(printf "%02d" $month)

# Add the document path for this month to the array
documents+=("s3://ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse/documents/${year}-${month}/*.zst")

# Get the size for this month and add it to the total size
# (sums Contents[].Size over the month's prefix; jq extracts the scalar)
month_size=$(aws s3api list-objects --bucket ai2-llm --prefix "pretraining-data/sources/cc-news/v1-resiliparse/documents/${year}-${month}/" --output json --query "[sum(Contents[].Size)]" | jq '.[0]' -rc)
size=$((size + month_size))
done

# Collect every local document file for this year (NUL-delimited so paths
# with whitespace survive) and accumulate the total byte size.
while IFS= read -r -d '' file; do
    documents+=("$file")
    # Use shell arithmetic instead of the legacy `expr` subprocess.
    size=$(( size + $(stat -c %s "$file") ))
done < <(find "${base_dir}/${year}" -type f \( -name "*.zst" -o -name "*.gz" -o -name "*.gzip" -o -name "*.json" -o -name "*.jsonl" \) -print0)

# run deduplication
# Log the year, the accumulated byte count (used to size the Bloom filter),
# and how many document paths were collected. (The duplicate pre-commit echo
# line — diff residue — is dropped; only the post-commit message remains.)
echo "Running fuzzy dedupe for ${year} with ${size} bytes Bloom filter (files: ${#documents[@]})"

# Start the output
document_linearized="documents:\n"
Expand All @@ -85,11 +40,11 @@ bloom_filter:
file: /tmp/cc_news_${year}_dedupe_ngram.bin
read_only: false
estimated_doc_count: ${size}
desired_false_positive_rate: 0.01
desired_false_positive_rate: 0.1
work_dir:
input: /tmp/cc_news_${year}_dedupe_para_ngrams_13_1/input
output: /tmp/cc_news_${year}_dedupe_para_ngrams_13_1/output
input: /tmp/cc_news_${year}_dedupe_ngrams_20_1/input
output: /tmp/cc_news_${year}_dedupe_ngrams_20_1/output
EOF
)

Expand All @@ -106,7 +61,9 @@ EOF
# Run deduplication with all but 4 cores (shell arithmetic instead of `expr`).
dolma -c "$temp_config_file" dedupe --processes $(( $(nproc) - 4 ))
set +ex


# Remove the temporary config file and this year's scratch directories.
rm "$temp_config_file"
# BUG FIX: the glob must be OUTSIDE the quotes — a quoted `*` is passed
# literally to rm and nothing is ever removed.
rm -rf "/tmp/cc_news_${year}"*

done
78 changes: 78 additions & 0 deletions configs/cc-news/mix-deupe-by-year.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# dolma mixer config: per-year CC-News streams, fuzzy-dedupe filtered.
# A document is kept when it has no dedupe spans, or when the character-
# weighted duplicate fraction of its text is at most 0.3.
# NOTE(review): the page extraction stripped all YAML indentation; nesting
# below is reconstructed from the anchors/aliases and dolma's mixer schema.
streams:
  - name: cc-news_2016
    documents:
      - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2016/*.json.zst
    attributes: &attributes
      - dedupe_ngrams_20_1
    output: &output
      max_size_in_bytes: 2_500_000_000
      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year-dedupe/documents
    filter: &filter
      include:
        - >-
          (.attributes.dedupe_ngrams_20_1 | length == 0) or
          ((.attributes.dedupe_ngrams_20_1 | map(.[2] * (.[1] - .[0])) | add) / (.text | length) <= 0.3)
      syntax: jq

  - name: cc-news_2017
    documents:
      - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2017/*.json.zst
    attributes: *attributes
    output: *output
    filter: *filter

  - name: cc-news_2018
    documents:
      - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2018/*.json.zst
    attributes: *attributes
    output: *output
    filter: *filter

  - name: cc-news_2019
    documents:
      - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2019/*.json.zst
    attributes: *attributes
    output: *output
    filter: *filter

  - name: cc-news_2020
    documents:
      - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2020/*.json.zst
    attributes: *attributes
    output: *output
    filter: *filter

  - name: cc-news_2021
    documents:
      - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2021/*.json.zst
    attributes: *attributes
    output: *output
    filter: *filter

  - name: cc-news_2022
    documents:
      - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2022/*.json.zst
    attributes: *attributes
    output: *output
    filter: *filter

  - name: cc-news_2023
    documents:
      - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2023/*.json.zst
    attributes: *attributes
    output: *output
    filter: *filter

  - name: cc-news_2024
    documents:
      - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2024/*.json.zst
    attributes: *attributes
    output: *output
    filter: *filter


work_dir:
  input: ${oc.env:HOME}/ai2-llm/work_dir/cc-news/v1-resiliparse-year/input
  output: ${oc.env:HOME}/ai2-llm/work_dir/cc-news/v1-resiliparse-year/output

processes: 188
0 comments on commit 2b6b5f7

Please sign in to comment.