-
Notifications
You must be signed in to change notification settings - Fork 120
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
5 changed files
with
3,116 additions
and
2,245 deletions.
There are no files selected for viewing
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
#!/usr/bin/env bash
#
# Fuzzy (13-gram, paragraph-level) deduplication of CC-News, one `dolma dedupe`
# run per calendar year.
#
# For each year 2016..2024 (months before 2016-08 and after 2024-07 are
# skipped) the script:
#   1. collects one `*.zst` document glob per month,
#   2. sums the object sizes S3 reports under each month prefix,
#   3. writes a dedupe config whose Bloom filter `estimated_doc_count` is that
#      byte total, and
#   4. runs `dolma dedupe` against the config.
#
# Requires on PATH: aws, jq, dolma, nproc, mktemp.
#
# Reference: a static config of the same shape (for C4), kept for comparison.
#
# documents:
#   - s3://ai2-llm/pretraining-data/sources/c4/v0/documents/train/*.gz
#
# dedupe:
#   name: dedupe_para_ngrams_13_1
#   paragraphs:
#     attribute_name: dedupe_para_ngrams_13_1
#     by_ngram:
#       ngram_length: 13
#       stride: 1
#       overlap_threshold: 0.5
#   skip_empty: true
#
# bloom_filter:
#   file: ${oc.env:HOME}/c4_dedupe_para_ngrams_13_1.bin
#   read_only: false
#   # estimated doc count is obtained by counting number of words in paragraphs
#   # then dividing by 13 (ngram_length) and multiplying by 2 (for each ngram)
#   estimated_doc_count: 359_916_731_334
#   desired_false_positive_rate: 0.1
#
# processes: 188
# work_dir:
#   input: /tmp/c4_dedupe_para_ngrams_13_1/input
#   output: /tmp/c4_dedupe_para_ngrams_13_1/output

set -euo pipefail

readonly S3_BUCKET="ai2-llm"
readonly S3_PREFIX="pretraining-data/sources/cc-news/v0-resiliparse/documents"

# Path of the generated config; global so the EXIT trap can still see it
# after main() has returned.
temp_config_file=''

# Remove the generated config on every exit path (success, failure, signal).
cleanup() {
  [[ -z "$temp_config_file" ]] || rm -f -- "$temp_config_file"
}
trap cleanup EXIT

# Print an error to stderr and abort.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

#######################################
# Total size in bytes of all objects under one month prefix.
# Arguments: $1 - month directory name, e.g. "2019-03"
# Outputs:   the byte count on stdout; 0 (plus a warning on stderr) when the
#            listing fails or does not yield a non-negative integer
#######################################
month_size_bytes() {
  local year_month=$1
  local raw
  raw=$(
    aws s3api list-objects \
      --bucket "$S3_BUCKET" \
      --prefix "${S3_PREFIX}/${year_month}/" \
      --output json \
      --query "[sum(Contents[].Size)]" \
      | jq -rc '.[0]'
  ) || raw=''

  if [[ "$raw" =~ ^[0-9]+$ ]]; then
    printf '%s\n' "$raw"
  else
    # Best effort: an empty/missing prefix must not abort the whole year.
    printf 'warning: no usable size for %s (got "%s"); counting 0 bytes\n' \
      "$year_month" "$raw" >&2
    printf '0\n'
  fi
}

#######################################
# Write the dolma dedupe config for one year.
# Arguments: $1 - output file, $2 - year, $3 - Bloom filter size estimate,
#            $4.. - document globs
# NOTE(review): the nesting below (skip_short_paragraphs under by_ngram,
# skip_empty under dedupe) is reconstructed; confirm against dolma's
# dedupe config schema.
#######################################
write_config() {
  local config_path=$1 year=$2 size=$3
  shift 3
  {
    printf 'documents:\n'
    # One list entry per glob; data is passed as an argument, never as the
    # printf format string.
    printf '  - %s\n' "$@"
    cat <<EOF

dedupe:
  name: dedupe_by_year
  paragraphs:
    attribute_name: dedupe_ngrams_13_1
    by_ngram:
      ngram_length: 13
      stride: 1
      overlap_threshold: 0.5
      skip_short_paragraphs: true
  skip_empty: true
bloom_filter:
  file: /tmp/cc_news_${year}_dedupe_ngram.bin
  read_only: false
  estimated_doc_count: ${size}
  desired_false_positive_rate: 0.1
work_dir:
  input: /tmp/cc_news_${year}_dedupe_para_ngrams_13_1/input
  output: /tmp/cc_news_${year}_dedupe_para_ngrams_13_1/output
EOF
  } >"$config_path"
}

main() {
  # Leave 4 cores free, but never ask dolma for fewer than one process.
  local processes
  processes=$(($(nproc) - 4))
  ((processes >= 1)) || processes=1

  temp_config_file=$(mktemp) || die "mktemp failed"

  local year month month_padded month_size size
  local -a documents

  # run years between 2016 and 2024
  for year in {2016..2024}; do
    documents=()
    size=0

    for month in {1..12}; do
      # Skip months before 8 in 2016 and months after 7 in 2024.
      if ((year == 2016 && month < 8)) || ((year == 2024 && month > 7)); then
        continue
      fi

      # Format month as 2 digits without clobbering the loop variable.
      printf -v month_padded '%02d' "$month"

      documents+=("s3://${S3_BUCKET}/${S3_PREFIX}/${year}-${month_padded}/*.zst")

      month_size=$(month_size_bytes "${year}-${month_padded}")
      size=$((size + month_size))
    done

    # A zero estimate means every listing failed or was empty; refuse to size
    # a Bloom filter from it.
    ((size > 0)) || die "no input bytes found for ${year}; check S3 access and prefix"

    echo "Running fuzzy dedupe for ${year} with ${size} bytes Bloom filter"

    write_config "$temp_config_file" "$year" "$size" "${documents[@]}"

    # Trace the dolma invocation only; `set -e` is already active.
    set -x
    dolma -c "$temp_config_file" dedupe --processes "$processes"
    set +x
  done
}

main "$@"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,204 @@ | ||
# One stream per calendar year of CC-News (2016-08 through 2024-07).
# Each stream reads the monthly v0-resiliparse shards for its year, applies the
# same attribute-based filter, and writes to a per-year output path.
# The first stream defines the YAML anchors (&output, &compression,
# &attributes, &filter) that every later stream reuses.
streams:
  - name: cc-news_2016
    documents:
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2016-08/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2016-09/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2016-10/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2016-11/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2016-12/*zst

    output: &output
      path: s3://ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2016
      max_size_in_bytes: 1_000_000_000

    compression: &compression
      input: zst
      output: zst

    attributes: &attributes
      - dedupe_by_year
      - dolma_v2_tokenizer

    filter: &filter
      exclude:
        # Drop documents whose tokenizer length attribute is at most 100.
        - >-
          (.attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100)
        # Drop documents where the weighted length of the dedupe spans is at
        # least half of the text length.
        # NOTE(review): presumably each span is [start, end, score] - confirm.
        - >-
          (.attributes.dedupe_ngrams_13_1 | length > 0) and
          ((.attributes.dedupe_ngrams_13_1 | map(.[2] * (.[1] - .[0])) | add) / (.text | length) >= 0.5)
      syntax: jq

  - name: cc-news_2017
    documents:
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2017-01/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2017-02/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2017-03/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2017-04/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2017-05/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2017-06/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2017-07/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2017-08/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2017-09/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2017-10/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2017-11/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2017-12/*zst
    output:
      # The explicit path overrides the one merged in from &output.
      path: s3://ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2017
      <<: *output

    compression: *compression
    attributes: *attributes
    filter: *filter

  - name: cc-news_2018
    documents:
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2018-01/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2018-02/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2018-03/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2018-04/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2018-05/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2018-06/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2018-07/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2018-08/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2018-09/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2018-10/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2018-11/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2018-12/*zst
    output:
      path: s3://ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2018
      <<: *output

    compression: *compression
    attributes: *attributes
    filter: *filter

  - name: cc-news_2019
    documents:
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2019-01/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2019-02/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2019-03/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2019-04/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2019-05/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2019-06/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2019-07/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2019-08/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2019-09/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2019-10/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2019-11/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2019-12/*zst
    output:
      path: s3://ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2019
      <<: *output

    compression: *compression
    attributes: *attributes
    filter: *filter

  - name: cc-news_2020
    documents:
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2020-01/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2020-02/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2020-03/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2020-04/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2020-05/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2020-06/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2020-07/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2020-08/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2020-09/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2020-10/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2020-11/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2020-12/*zst
    output:
      path: s3://ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2020
      <<: *output

    compression: *compression
    attributes: *attributes
    filter: *filter

  - name: cc-news_2021
    documents:
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2021-01/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2021-02/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2021-03/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2021-04/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2021-05/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2021-06/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2021-07/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2021-08/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2021-09/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2021-10/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2021-11/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2021-12/*zst
    output:
      path: s3://ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2021
      <<: *output

    compression: *compression
    attributes: *attributes
    filter: *filter

  - name: cc-news_2022
    documents:
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2022-01/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2022-02/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2022-03/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2022-04/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2022-05/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2022-06/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2022-07/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2022-08/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2022-09/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2022-10/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2022-11/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2022-12/*zst
    output:
      path: s3://ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2022
      <<: *output

    compression: *compression
    attributes: *attributes
    filter: *filter

  - name: cc-news_2023
    documents:
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2023-01/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2023-02/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2023-03/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2023-04/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2023-05/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2023-06/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2023-07/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2023-08/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2023-09/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2023-10/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2023-11/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2023-12/*zst
    output:
      path: s3://ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2023
      <<: *output

    compression: *compression
    attributes: *attributes
    filter: *filter

  - name: cc-news_2024
    documents:
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2024-01/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2024-02/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2024-03/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2024-04/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2024-05/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2024-06/*zst
      - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2024-07/*zst
    output:
      path: s3://ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2024
      <<: *output

    compression: *compression
    attributes: *attributes
    filter: *filter


processes: 1
Oops, something went wrong.