Skip to content

Commit

Permalink
configs
Browse files Browse the repository at this point in the history
  • Loading branch information
soldni committed Nov 12, 2024
1 parent 4c3cab6 commit da4957c
Show file tree
Hide file tree
Showing 5 changed files with 3,116 additions and 2,245 deletions.
File renamed without changes.
114 changes: 114 additions & 0 deletions configs/cc-news/dedupe-year.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
#! /usr/bin/env bash

# documents:
# - s3://ai2-llm/pretraining-data/sources/c4/v0/documents/train/*.gz

# dedupe:
# name: dedupe_para_ngrams_13_1
# paragraphs:
# attribute_name: dedupe_para_ngrams_13_1
# by_ngram:
# ngram_length: 13
# stride: 1
# overlap_threshold: 0.5
# skip_empty: true

# bloom_filter:
# file: ${oc.env:HOME}/c4_dedupe_para_ngrams_13_1.bin
# read_only: false
# # estimated doc count is obtained by counting number of words in paragraphs
# # then dividing by 13 (ngram_length) and multiplying by 2 (for each ngram)
# estimated_doc_count: 359_916_731_334
# desired_false_positive_rate: 0.1

# processes: 188
# work_dir:
# input: /tmp/c4_dedupe_para_ngrams_13_1/input
# output: /tmp/c4_dedupe_para_ngrams_13_1/output

# Run one fuzzy (13-gram) paragraph dedupe pass per year, 2016 through 2024.
# For each year this script:
#   1. collects the monthly S3 globs and sums the size of the objects under them,
#   2. renders a dolma dedupe config into a temporary file,
#   3. runs `dolma dedupe` with that config and removes the temporary file.
for year in {2016..2024}; do
    # Document globs for this year and the accumulated size (in bytes) of the
    # objects they cover.
    documents=()
    size=0

    for month in {1..12}; do
        # Months after 2024-07 are skipped.
        if [ "$year" -eq 2024 ] && [ "$month" -gt 7 ]; then
            continue
        fi

        # Months before 2016-08 are skipped.
        if [ "$year" -eq 2016 ] && [ "$month" -lt 8 ]; then
            continue
        fi

        # Zero-pad into a separate variable so the loop variable stays numeric.
        month_padded=$(printf "%02d" "$month")

        documents+=("s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/${year}-${month_padded}/*.zst")

        # Sum the sizes of all objects under this month's prefix.
        month_size=$(aws s3api list-objects --bucket ai2-llm --prefix "pretraining-data/sources/cc-news/v0-resiliparse/documents/${year}-${month_padded}/" --output json --query "[sum(Contents[].Size)]" | jq '.[0]' -rc)
        size=$((size + month_size))
    done

    # NOTE(review): ${size} is a byte total but is passed below as
    # bloom_filter.estimated_doc_count — confirm this sizing is intentional.
    echo "Running fuzzy dedupe for ${year} with ${size} bytes Bloom filter"

    # Render the `documents:` list with real newlines so the config can be
    # written verbatim instead of being interpreted as a printf format string.
    document_linearized="documents:"
    for doc in "${documents[@]}"; do
        document_linearized+=$'\n'"  - ${doc}"
    done

    # The heredoc body and its terminator must start at column 0.
    config_yaml=$(cat <<EOF
${document_linearized}

dedupe:
  name: dedupe_by_year
  paragraphs:
    attribute_name: dedupe_ngrams_13_1
    by_ngram:
      ngram_length: 13
      stride: 1
      overlap_threshold: 0.5
      skip_short_paragraphs: true
  skip_empty: true
bloom_filter:
  file: /tmp/cc_news_${year}_dedupe_ngram.bin
  read_only: false
  estimated_doc_count: ${size}
  desired_false_positive_rate: 0.1
work_dir:
  input: /tmp/cc_news_${year}_dedupe_para_ngrams_13_1/input
  output: /tmp/cc_news_${year}_dedupe_para_ngrams_13_1/output
EOF
)

    # Create a temporary file for the YAML config and write it out verbatim.
    temp_config_file=$(mktemp)
    printf '%s\n' "$config_yaml" > "$temp_config_file"

    # Leave four cores free, but never ask for fewer than one process.
    processes=$(( $(nproc) - 4 ))
    if [ "$processes" -lt 1 ]; then
        processes=1
    fi

    # Echo the command and abort the script if dolma fails.
    set -ex
    dolma -c "$temp_config_file" dedupe --processes "$processes"
    set +ex

    # Remove the temporary file.
    rm "$temp_config_file"
done
21 changes: 8 additions & 13 deletions configs/cc-news/make_lang_partition.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,33 +6,28 @@
SRC_BASE = "s3://ai2-llm/pretraining-data/sources/cc-news"
SRC_PRFX = "v1-resiliparse"
LANG_THR = 100_000
DST_BASE = "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news"
DST_BASE = "s3://ai2-llm/pretraining-data/sources/cc-news"
DST_PRFX = f"v2-resiliparse-l{LANG_THR // 1000}k"


def base_stream_config(lang: str, year: int, months: List[int]):
return {
"name": f"cc-news_{year:04d}_{lang}",
"documents": [
f"{SRC_BASE}/{SRC_PRFX}/documents/{year:04d}-{month:02d}/*.zst"
for month in months
],
"documents": [f"{SRC_BASE}/{SRC_PRFX}/documents/{year:04d}-{month:02d}/*.zst" for month in months],
"compression": {"input": "zst", "output": "zst"},
"output": {
"path": f"{DST_BASE}/{DST_PRFX}/documents/{lang}/{year:04d}",
"max_size_in_bytes": 10_000_000_000,
},
"attributes": ["ft_lang_id_1e2", "dolma_v2_tokenizer"],
"filter": {
"include": [
"include": [],
"exclude": [
# at least 100 tokens
".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100",
# make sure the language is present and the confidence is high enough and that it is the highest confidence
(
f"(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__{lang} != null) and "
+ f"(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__{lang}[0][-1] >= 0.5) and "
+ f'((.attributes | to_entries | map(select(.key | startswith("ft_lang_id_1e2__ft_lang_id_1e2__"))) | max_by(.value) | .key ) == "ft_lang_id_1e2__ft_lang_id_1e2__{lang}")'
),
".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100",
# no language detected or low confidence
f"(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__{lang} == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__{lang}[0][-1] < 0.5)",

],
"syntax": "jq",
},
Expand Down
204 changes: 204 additions & 0 deletions configs/cc-news/mix_v1-year.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@
# Mixer config with one stream per calendar year of cc-news.
# This first stream also defines the YAML anchors (&output, &compression,
# &attributes, &filter) that the other streams in this file reference.
streams:
# 2016 stream: only the monthly inputs 2016-08 through 2016-12 are listed.
- name: cc-news_2016
documents:
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2016-08/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2016-09/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2016-10/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2016-11/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2016-12/*zst

# Anchored as &output: other streams merge these keys via `<<: *output`
# and set their own `path`.
output: &output
path: s3://ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2016
max_size_in_bytes: 1_000_000_000

# Anchored as &compression: zst for both input and output.
compression: &compression
input: zst
output: zst

# Anchored as &attributes: the filter below reads fields under `.attributes`
# (presumably populated from these two attribute sets — confirm).
attributes: &attributes
- dedupe_by_year
- dolma_v2_tokenizer

# Anchored as &filter. Two jq exclusion rules:
#   1. the dolma_v2_tokenizer length value is <= 100;
#   2. dedupe_ngrams_13_1 is non-empty and the sum of .[2] * (.[1] - .[0])
#      over its entries, divided by the length of .text, is >= 0.5
#      (entries are presumably [start, end, score] spans — confirm).
filter: &filter
exclude:
- >-
(.attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100)
- >-
(.attributes.dedupe_ngrams_13_1 | length > 0) and
((.attributes.dedupe_ngrams_13_1 | map(.[2] * (.[1] - .[0])) | add) / (.text | length) >= 0.5)
syntax: jq

# 2017 stream: all twelve monthly inputs, 2017-01 through 2017-12.
- name: cc-news_2017
documents:
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2017-01/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2017-02/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2017-03/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2017-04/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2017-05/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2017-06/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2017-07/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2017-08/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2017-09/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2017-10/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2017-11/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2017-12/*zst
# `<<: *output` merges the keys of the &output anchor into this mapping.
# Under YAML merge-key rules the explicit `path` here takes precedence over
# the anchored one, so only the remaining anchored keys are inherited
# (assumes the loader honours merge keys — confirm).
output:
path: s3://ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2017
<<: *output

# Reuse the anchored compression, attributes and filter settings unchanged.
compression: *compression
attributes: *attributes
filter: *filter

- name: cc-news_2018
documents:
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2018-01/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2018-02/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2018-03/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2018-04/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2018-05/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2018-06/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2018-07/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2018-08/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2018-09/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2018-10/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2018-11/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2018-12/*zst
output:
path: s3://ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2018
<<: *output

compression: *compression
attributes: *attributes
filter: *filter

- name: cc-news_2019
documents:
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2019-01/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2019-02/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2019-03/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2019-04/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2019-05/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2019-06/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2019-07/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2019-08/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2019-09/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2019-10/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2019-11/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2019-12/*zst
output:
path: s3://ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2019
<<: *output

compression: *compression
attributes: *attributes
filter: *filter

- name: cc-news_2020
documents:
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2020-01/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2020-02/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2020-03/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2020-04/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2020-05/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2020-06/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2020-07/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2020-08/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2020-09/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2020-10/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2020-11/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2020-12/*zst
output:
path: s3://ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2020
<<: *output

compression: *compression
attributes: *attributes
filter: *filter

- name: cc-news_2021
documents:
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2021-01/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2021-02/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2021-03/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2021-04/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2021-05/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2021-06/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2021-07/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2021-08/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2021-09/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2021-10/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2021-11/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2021-12/*zst
output:
path: s3://ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2021
<<: *output

compression: *compression
attributes: *attributes
filter: *filter

- name: cc-news_2022
documents:
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2022-01/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2022-02/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2022-03/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2022-04/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2022-05/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2022-06/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2022-07/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2022-08/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2022-09/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2022-10/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2022-11/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2022-12/*zst
output:
path: s3://ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2022
<<: *output

compression: *compression
attributes: *attributes
filter: *filter

- name: cc-news_2023
documents:
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2023-01/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2023-02/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2023-03/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2023-04/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2023-05/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2023-06/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2023-07/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2023-08/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2023-09/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2023-10/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2023-11/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2023-12/*zst
output:
path: s3://ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2023
<<: *output

compression: *compression
attributes: *attributes
filter: *filter

# 2024 stream: only the monthly inputs 2024-01 through 2024-07 are listed.
- name: cc-news_2024
documents:
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2024-01/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2024-02/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2024-03/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2024-04/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2024-05/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2024-06/*zst
- s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2024-07/*zst
# Explicit `path` for this year; the other output keys come from the
# `output` anchor via the merge key.
output:
path: s3://ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2024
<<: *output

# Reuse the anchored compression, attributes and filter settings unchanged.
compression: *compression
attributes: *attributes
filter: *filter


# NOTE(review): a single process for nine yearly streams looks conservative —
# confirm this is intentional and not a leftover from debugging.
processes: 1
Loading

0 comments on commit da4957c

Please sign in to comment.