Skip to content

Commit

Permalink
Add preliminary Dolma v1.7 configurations, fix corner case in tokens. (#120)
Browse files Browse the repository at this point in the history

* data

* added configs

* adding experiments for blocklist

* fixed bug in tokenizer

* removed models dir for now
  • Loading branch information
soldni authored Feb 13, 2024
1 parent b9a8b94 commit c6c0b47
Show file tree
Hide file tree
Showing 11 changed files with 510 additions and 19 deletions.
97 changes: 97 additions & 0 deletions configs/dolma-v1_7/v1_5-baseline/300g_sample.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@

# Dolma mixer config: sample ~10% (~300 GB) of the olmo-mix v1.5 sources,
# excluding documents/paragraphs flagged as duplicates of the Paloma eval
# suite (decontamination). Anchors defined on the first stream are reused
# (via * / <<:) by every other stream; only `path` differs per source.
streams:
  - name: books
    documents:
      - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/books/*.gz
    attributes: &attributes
      - paloma_paragraphs
      - paloma_documents
      - random_number_v1
    output: &output
      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_5-300G-decon/documents/books
      max_size_in_bytes: 4294967296  # 4 GiB per output file
      discard_fields:
        - attributes  # drop attribute payloads from the mixed output
    filter: &filter
      # Keep ~10% of documents using the precomputed per-document random number.
      include:
        - "$.attributes[?(@.random_number_v1__random_number_v1__random[0][2] < 0.10)]"
      # Drop anything the BFF dedupe marked as a duplicate of Paloma eval data.
      # NOTE(review): "$@." reconstructed from a garbled source; standard dolma
      # mixer JSONPath examples use "$." — confirm against the original config.
      exclude:
        - "$@.attributes[?(@.paloma_documents_bff_duplicates && @.paloma_documents_bff_duplicates[0] && @.paloma_documents_bff_duplicates[0][2] >= 1.0)]"
        - "$@.attributes[?(@.paloma_paragraphs_bff_duplicates && @.paloma_paragraphs_bff_duplicates[0] && @.paloma_paragraphs_bff_duplicates[0][2] >= 1.0)]"

  - name: c4
    documents:
      - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/c4/*.gz
    attributes: *attributes
    output:
      <<: *output
      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_5-300G-decon/documents/c4
    filter: *filter

  - name: cc_en_head
    documents:
      - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/cc_en_head/*.gz
    attributes: *attributes
    output:
      <<: *output
      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_5-300G-decon/documents/cc_en_head
    filter: *filter

  - name: cc_en_middle
    documents:
      - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/cc_en_middle/*.gz
    attributes: *attributes
    output:
      <<: *output
      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_5-300G-decon/documents/cc_en_middle
    filter: *filter

  - name: cc_en_tail
    documents:
      - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/cc_en_tail/*.gz
    attributes: *attributes
    output:
      <<: *output
      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_5-300G-decon/documents/cc_en_tail
    filter: *filter

  - name: pes2o
    documents:
      - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/pes2o/*.gz
    attributes: *attributes
    output:
      <<: *output
      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_5-300G-decon/documents/pes2o
    filter: *filter

  - name: reddit
    documents:
      - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/reddit/*.gz
    attributes: *attributes
    output:
      <<: *output
      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_5-300G-decon/documents/reddit
    filter: *filter

  - name: stack
    documents:
      - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/stack/*.gz
    attributes: *attributes
    output:
      <<: *output
      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_5-300G-decon/documents/stack
    filter: *filter

  - name: wiki
    documents:
      - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/wiki/*.gz
    attributes: *attributes
    output:
      <<: *output
      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_5-300G-decon/documents/wiki
    filter: *filter

work_dir:
  input: "/tmp/olmo-mix-v1_5/input"
  output: "/tmp/olmo-mix-v1_5/output"
processes: 188
23 changes: 23 additions & 0 deletions configs/dolma-v1_7/v1_5-baseline/300g_tok.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Tokenizer config: tokenize the decontaminated v1.5 300 GB sample with the
# OLMo GPT-NeoX tokenizer. Consumed by `dolma tokens`.
destination: ${oc.env:HOME}/ai2-llm/preprocessed/olmo-mix/v1_5-300G-decon/gpt-neox-olmo-dolma-v1_6
documents:
  - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_5-300G-decon/documents/books
  - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_5-300G-decon/documents/c4
  - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_5-300G-decon/documents/cc_en_head
  - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_5-300G-decon/documents/cc_en_middle
  - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_5-300G-decon/documents/cc_en_tail
  - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_5-300G-decon/documents/pes2o
  - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_5-300G-decon/documents/reddit
  - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_5-300G-decon/documents/stack
  - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_5-300G-decon/documents/wiki
  # - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/starcoder/v0/documents/*/*.json.gz

processes: 40
seed: 3920
# ~20 GiB per output file. NOTE(review): underscore digit grouping is a YAML
# 1.1 int; YAML 1.2 parsers read it as a string — the consumer coerces it.
max_size: 21_474_836_480

tokenizer:
  name_or_path: allenai/gpt-neox-olmo-dolma-v1_5
  bos_token_id: null  # no BOS token for this tokenizer
  eos_token_id: 50279
  pad_token_id: 1
  segment_before_tokenization: false
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Dedupe config: tag whole documents that collide with the Paloma eval suite.
# The bloom filter is read-only (pre-built from Paloma), so this only marks
# duplicates via the attribute below; it does not add new entries.
documents:
  - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/books/*.gz
  - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/c4/*.gz
  - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/cc_en_head/*.gz
  - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/cc_en_middle/*.gz
  - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/cc_en_tail/*.gz
  - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/pes2o/*.gz
  - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/reddit/*.gz
  - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/stack/*.gz
  - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/wiki/*.gz


dedupe:
  name: paloma_documents
  documents:
    attribute_name: paloma_documents_bff_duplicates
    key: $.text  # dedupe on the full document text
    skip_empty: true

bloom_filter:
  read_only: true
  estimated_doc_count: 188815
  # NOTE(review): "1e-15" is a string under YAML 1.1 parsers (no dot in the
  # mantissa); the consumer is expected to coerce it to a float.
  desired_false_positive_rate: 1e-15
  file: ${oc.env:HOME}/perplexity/filters/paloma_documents.bin

processes: 94
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Dedupe config: tag individual paragraphs that collide with the Paloma eval
# suite. Same read-only bloom-filter setup as the document-level config, but
# operating at paragraph granularity.
documents:
  - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/books/*.gz
  - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/c4/*.gz
  - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/cc_en_head/*.gz
  - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/cc_en_middle/*.gz
  - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/cc_en_tail/*.gz
  - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/pes2o/*.gz
  - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/reddit/*.gz
  - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/stack/*.gz
  - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/wiki/*.gz
  # - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/books/*.gz
  # - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/wiki/*.gz


dedupe:
  name: paloma_paragraphs
  paragraphs:
    attribute_name: paloma_paragraphs_bff_duplicates
    skip_empty: true

bloom_filter:
  read_only: true
  estimated_doc_count: 2336120
  # size_in_bytes: 104857600 # 100 MB; smaller causes too many FPs
  # NOTE(review): "1e-15" is a string under YAML 1.1 parsers; the consumer
  # coerces it to a float.
  desired_false_positive_rate: 1e-15
  # file: s3://ai2-llm/bloom-filters/perplexity-suite-v3_option2.bin
  file: ${oc.env:HOME}/perplexity/filters/paloma_paragraphs.bin

processes: 94
96 changes: 96 additions & 0 deletions configs/dolma-v1_7/v1_6-baseline/300g_sample.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
# Dolma mixer config: sample ~10% (~300 GB) of the olmo-mix v1.5r2 sources
# into the v1.6 decontaminated mix, excluding documents/paragraphs flagged as
# duplicates of the Paloma eval suite. Anchors on the first stream are reused
# by every other stream; only `path` differs per source.
streams:
  - name: books
    documents:
      - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/books/*.gz
    attributes: &attributes
      - paloma_paragraphs
      - paloma_documents
      - random_number_v1
    output: &output
      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon/documents/books
      max_size_in_bytes: 38949672960  # ~36 GiB per output file
      discard_fields:
        - attributes  # drop attribute payloads from the mixed output
    filter: &filter
      # Keep ~10% of documents using the precomputed per-document random number.
      include:
        - "$.attributes[?(@.random_number_v1__random_number_v1__random[0][2] < 0.10)]"
      # Drop anything the BFF dedupe marked as a duplicate of Paloma eval data.
      # NOTE(review): "$@." reconstructed from a garbled source; standard dolma
      # mixer JSONPath examples use "$." — confirm against the original config.
      exclude:
        - "$@.attributes[?(@.paloma_documents_bff_duplicates && @.paloma_documents_bff_duplicates[0] && @.paloma_documents_bff_duplicates[0][2] >= 1.0)]"
        - "$@.attributes[?(@.paloma_paragraphs_bff_duplicates && @.paloma_paragraphs_bff_duplicates[0] && @.paloma_paragraphs_bff_duplicates[0][2] >= 1.0)]"

  - name: c4
    documents:
      - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/c4/*.gz
    attributes: *attributes
    output:
      <<: *output
      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon/documents/c4
    filter: *filter

  - name: cc_en_head
    documents:
      - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/cc_en_head/*.gz
    attributes: *attributes
    output:
      <<: *output
      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon/documents/cc_en_head
    filter: *filter

  - name: cc_en_middle
    documents:
      - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/cc_en_middle/*.gz
    attributes: *attributes
    output:
      <<: *output
      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon/documents/cc_en_middle
    filter: *filter

  - name: cc_en_tail
    documents:
      - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/cc_en_tail/*.gz
    attributes: *attributes
    output:
      <<: *output
      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon/documents/cc_en_tail
    filter: *filter

  - name: pes2o
    documents:
      - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/pes2o/*.gz
    attributes: *attributes
    output:
      <<: *output
      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon/documents/pes2o
    filter: *filter

  - name: reddit
    documents:
      - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/reddit/*.gz
    attributes: *attributes
    output:
      <<: *output
      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon/documents/reddit
    filter: *filter

  - name: stack
    documents:
      - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/stack/*.gz
    attributes: *attributes
    output:
      <<: *output
      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon/documents/stack
    filter: *filter

  - name: wiki
    documents:
      - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/wiki/*.gz
    attributes: *attributes
    output:
      <<: *output
      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon/documents/wiki
    filter: *filter

work_dir:
  input: "/tmp/olmo-mix-v1_6/input"
  output: "/tmp/olmo-mix-v1_6/output"
processes: 188
23 changes: 23 additions & 0 deletions configs/dolma-v1_7/v1_6-baseline/300g_tok.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Tokenizer config: tokenize the decontaminated v1.6 300 GB sample with the
# OLMo GPT-NeoX tokenizer. Consumed by `dolma tokens`.
destination: ${oc.env:HOME}/ai2-llm/preprocessed/olmo-mix/v1_6-300G-decon/gpt-neox-olmo-dolma-v1_6
documents:
  - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon/documents/books
  - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon/documents/c4
  - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon/documents/cc_en_head
  - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon/documents/cc_en_middle
  - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon/documents/cc_en_tail
  - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon/documents/pes2o
  - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon/documents/reddit
  - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon/documents/stack
  - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon/documents/wiki
  # - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/starcoder/v0/documents/*/*.json.gz

processes: 40
seed: 3920
# ~20 GiB per output file. NOTE(review): underscore digit grouping is a YAML
# 1.1 int; YAML 1.2 parsers read it as a string — the consumer coerces it.
max_size: 21_474_836_480

tokenizer:
  name_or_path: allenai/gpt-neox-olmo-dolma-v1_5
  bos_token_id: null  # no BOS token for this tokenizer
  eos_token_id: 50279
  pad_token_id: 1
  segment_before_tokenization: false
31 changes: 31 additions & 0 deletions configs/dolma-v1_7/v1_6-baseline/tok_per_source.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/usr/bin/env bash
# Tokenize each decontaminated v1.6 source separately, sizing the number of
# worker processes per source. Fixes: portable shebang (`/bin/env` does not
# exist on most systems), quoted ${processes}, and comma-splitting via
# parameter expansion instead of two echo|cut subshells per iteration.

# "source_name,num_processes" pairs
sources=(
    "books,1"
    "c4,5"
    "cc_en_head,10"
    "cc_en_middle,10"
    "cc_en_tail,10"
    "pes2o,3"
    "reddit,2"
    "stack,5"
    "wiki,1"
)

set -x

for entry in "${sources[@]}"; do
    # split "name,count": strip longest ",*" suffix / "*," prefix
    source="${entry%%,*}"
    processes="${entry##*,}"

    dolma tokens \
        --destination "s3://ai2-llm/preprocessed/olmo-mix/v1_6-300G-decon/gpt-neox-olmo-dolma-v1_6_persource/${source}" \
        --documents "${HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon/documents/${source}" \
        --tokenizer.name_or_path "allenai/gpt-neox-olmo-dolma-v1_5" \
        --tokenizer.eos_token_id 50279 \
        --tokenizer.pad_token_id 1 \
        --processes "${processes}" \
        --seed 3920 \
        --max_size "21_474_836_480"
done
Loading

0 comments on commit c6c0b47

Please sign in to comment.