Skip to content

Commit

Permalink
mix
Browse files — browse the repository at this point in the history
  • Loading branch information
soldni committed Dec 30, 2024
1 parent c944b6a commit 2b6b5f7
Show file tree
Hide file tree
Showing 2 changed files with 89 additions and 54 deletions.
65 changes: 11 additions & 54 deletions configs/cc-news/dedupe_by_year.sh
Original file line number Diff line number Diff line change
@@ -1,64 +1,19 @@
#! /usr/bin/env bash

# documents:
# - s3://ai2-llm/pretraining-data/sources/c4/v0/documents/train/*.gz

# dedupe:
# name: dedupe_para_ngrams_13_1
# paragraphs:
# attribute_name: dedupe_para_ngrams_13_1
# by_ngram:
# ngram_length: 13
# stride: 1
# overlap_threshold: 0.5
# skip_empty: true

# bloom_filter:
# file: ${oc.env:HOME}/c4_dedupe_para_ngrams_13_1.bin
# read_only: false
# # estimated doc count is obtained by counting number of words in paragraphs
# # then dividing by 13 (ngram_length) and multiplying by 2 (for each ngram)
# estimated_doc_count: 359_916_731_334
# desired_false_positive_rate: 0.1

# processes: 188
# work_dir:
# input: /tmp/c4_dedupe_para_ngrams_13_1/input
# output: /tmp/c4_dedupe_para_ngrams_13_1/output
base_dir="${HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents"

# run years between 2016 and 2024
for year in {2016..2024}; do

# Initialize an empty array to store document paths and a variable for total size
documents=()
size=0

# NOTE(review): this S3-based month loop appears to be superseded diff residue —
# the local find-based loop later in the script also populates `documents` and
# `size`; confirm against the committed file whether both collectors should run.
# Collect all month document paths into the array and accumulate size
for month in {1..12}; do
# Skip months after 7 if year is 2024
# (CC-News coverage for 2024 presumably ends at July — TODO confirm)
if [ $year -eq 2024 ] && [ $month -gt 7 ]; then
continue
fi

# Skip months before 8 if year is 2016
# (CC-News coverage presumably starts August 2016 — TODO confirm)
if [ $year -eq 2016 ] && [ $month -lt 8 ]; then
continue
fi

# Format month as 2 digits
month=$(printf "%02d" $month)

# Add the document path for this month to the array
documents+=("s3://ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse/documents/${year}-${month}/*.zst")

# Get the size for this month and add it to the total size
# (sums Contents[].Size over the month's prefix; jq extracts the scalar)
month_size=$(aws s3api list-objects --bucket ai2-llm --prefix "pretraining-data/sources/cc-news/v1-resiliparse/documents/${year}-${month}/" --output json --query "[sum(Contents[].Size)]" | jq '.[0]' -rc)
size=$((size + month_size))
done

# Collect every local document file for this year (NUL-delimited so paths
# with whitespace survive) and accumulate the total byte size.
while IFS= read -r -d '' file; do
    documents+=("$file")
    # Use shell arithmetic instead of the legacy `expr` subprocess.
    size=$(( size + $(stat -c %s "$file") ))
done < <(find "${base_dir}/${year}" -type f \( -name "*.zst" -o -name "*.gz" -o -name "*.gzip" -o -name "*.json" -o -name "*.jsonl" \) -print0)

# run deduplication
# Log the year, the accumulated byte count (used to size the Bloom filter),
# and how many document paths were collected. (The duplicate pre-commit echo
# line — diff residue — is dropped; only the post-commit message remains.)
echo "Running fuzzy dedupe for ${year} with ${size} bytes Bloom filter (files: ${#documents[@]})"

# Start the output
document_linearized="documents:\n"
Expand All @@ -85,11 +40,11 @@ bloom_filter:
file: /tmp/cc_news_${year}_dedupe_ngram.bin
read_only: false
estimated_doc_count: ${size}
desired_false_positive_rate: 0.01
desired_false_positive_rate: 0.1
work_dir:
input: /tmp/cc_news_${year}_dedupe_para_ngrams_13_1/input
output: /tmp/cc_news_${year}_dedupe_para_ngrams_13_1/output
input: /tmp/cc_news_${year}_dedupe_ngrams_20_1/input
output: /tmp/cc_news_${year}_dedupe_ngrams_20_1/output
EOF
)

Expand All @@ -106,7 +61,9 @@ EOF
# Run deduplication with all but 4 cores (shell arithmetic instead of `expr`).
dolma -c "$temp_config_file" dedupe --processes $(( $(nproc) - 4 ))
set +ex


# Remove the temporary config file and this year's scratch directories.
rm "$temp_config_file"
# BUG FIX: the glob must be OUTSIDE the quotes — a quoted `*` is passed
# literally to rm and nothing is ever removed.
rm -rf "/tmp/cc_news_${year}"*

done
78 changes: 78 additions & 0 deletions configs/cc-news/mix-deupe-by-year.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# dolma mixer config: per-year CC-News streams, fuzzy-dedupe filtered.
# A document is kept when it has no dedupe spans, or when the character-
# weighted duplicate fraction of its text is at most 0.3.
# NOTE(review): the page extraction stripped all YAML indentation; nesting
# below is reconstructed from the anchors/aliases and dolma's mixer schema.
streams:
  - name: cc-news_2016
    documents:
      - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2016/*.json.zst
    attributes: &attributes
      - dedupe_ngrams_20_1
    output: &output
      max_size_in_bytes: 2_500_000_000
      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year-dedupe/documents
    filter: &filter
      include:
        - >-
          (.attributes.dedupe_ngrams_20_1 | length == 0) or
          ((.attributes.dedupe_ngrams_20_1 | map(.[2] * (.[1] - .[0])) | add) / (.text | length) <= 0.3)
      syntax: jq

  - name: cc-news_2017
    documents:
      - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2017/*.json.zst
    attributes: *attributes
    output: *output
    filter: *filter

  - name: cc-news_2018
    documents:
      - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2018/*.json.zst
    attributes: *attributes
    output: *output
    filter: *filter

  - name: cc-news_2019
    documents:
      - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2019/*.json.zst
    attributes: *attributes
    output: *output
    filter: *filter

  - name: cc-news_2020
    documents:
      - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2020/*.json.zst
    attributes: *attributes
    output: *output
    filter: *filter

  - name: cc-news_2021
    documents:
      - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2021/*.json.zst
    attributes: *attributes
    output: *output
    filter: *filter

  - name: cc-news_2022
    documents:
      - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2022/*.json.zst
    attributes: *attributes
    output: *output
    filter: *filter

  - name: cc-news_2023
    documents:
      - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2023/*.json.zst
    attributes: *attributes
    output: *output
    filter: *filter

  - name: cc-news_2024
    documents:
      - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2024/*.json.zst
    attributes: *attributes
    output: *output
    filter: *filter


work_dir:
  input: ${oc.env:HOME}/ai2-llm/work_dir/cc-news/v1-resiliparse-year/input
  output: ${oc.env:HOME}/ai2-llm/work_dir/cc-news/v1-resiliparse-year/output

processes: 188
0 comments on commit 2b6b5f7

Please sign in to comment.