diff --git a/configs/cc-news/dedupe.sh b/configs/cc-news/dedupe-month.sh similarity index 100% rename from configs/cc-news/dedupe.sh rename to configs/cc-news/dedupe-month.sh diff --git a/configs/cc-news/dedupe-year.sh b/configs/cc-news/dedupe-year.sh new file mode 100644 index 00000000..0e238953 --- /dev/null +++ b/configs/cc-news/dedupe-year.sh @@ -0,0 +1,114 @@ +#! /usr/bin/env bash + +# documents: +# - s3://ai2-llm/pretraining-data/sources/c4/v0/documents/train/*.gz + +# dedupe: +# name: dedupe_para_ngrams_13_1 +# paragraphs: +# attribute_name: dedupe_para_ngrams_13_1 +# by_ngram: +# ngram_length: 13 +# stride: 1 +# overlap_threshold: 0.5 +# skip_empty: true + +# bloom_filter: +# file: ${oc.env:HOME}/c4_dedupe_para_ngrams_13_1.bin +# read_only: false +# # estimated doc count is obtained by counting number of words in paragraphs +# # then dividing by 13 (ngram_length) and multiplying by 2 (for each ngram) +# estimated_doc_count: 359_916_731_334 +# desired_false_positive_rate: 0.1 + +# processes: 188 +# work_dir: +# input: /tmp/c4_dedupe_para_ngrams_13_1/input +# output: /tmp/c4_dedupe_para_ngrams_13_1/output + +# run years between 2016 and 2024 +for year in {2016..2024}; do + # run months between 1 and 12 + + # Initialize an empty array to store document paths and a variable for total size + documents=() + size=0 + + # Collect all month document paths into the array and accumulate size + for month in {1..12}; do + # Skip months after 7 if year is 2024 + if [ $year -eq 2024 ] && [ $month -gt 7 ]; then + continue + fi + + # Skip months before 8 if year is 2016 + if [ $year -eq 2016 ] && [ $month -lt 8 ]; then + continue + fi + + # Format month as 2 digits + month=$(printf "%02d" $month) + + # Add the document path for this month to the array + documents+=("s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/${year}-${month}/*.zst") + + # Get the size for this month and add it to the total size + month_size=$(aws s3api list-objects --bucket ai2-llm --prefix "pretraining-data/sources/cc-news/v0-resiliparse/documents/${year}-${month}/" --output json --query "[sum(Contents[].Size)]" | jq '.[0]' -rc) + size=$((size + month_size)) + done + + + # run deduplication + echo "Running fuzzy dedupe for ${year} with ${size} bytes Bloom filter" + + # Start the output + document_linearized="documents:\n" + + # Loop through the array and append each element + for doc in "${documents[@]}"; do + document_linearized+=" - $doc\n" + done + + config_yaml=$(cat < "$temp_config_file" + + + set -ex + # Run dolma with the temporary config file + dolma -c "$temp_config_file" dedupe --processes $(expr $(nproc) - 4) + set +ex + + # Remove the temporary file + rm "$temp_config_file" + + done +done diff --git a/configs/cc-news/make_lang_partition.py b/configs/cc-news/make_lang_partition.py index 85b175e1..217a4f36 100644 --- a/configs/cc-news/make_lang_partition.py +++ b/configs/cc-news/make_lang_partition.py @@ -6,17 +6,14 @@ SRC_BASE = "s3://ai2-llm/pretraining-data/sources/cc-news" SRC_PRFX = "v1-resiliparse" LANG_THR = 100_000 -DST_BASE = "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news" +DST_BASE = "s3://ai2-llm/pretraining-data/sources/cc-news" DST_PRFX = f"v2-resiliparse-l{LANG_THR // 1000}k" def base_stream_config(lang: str, year: int, months: List[int]): return { "name": f"cc-news_{year:04d}_{lang}", - "documents": [ - f"{SRC_BASE}/{SRC_PRFX}/documents/{year:04d}-{month:02d}/*.zst" - for month in months - ], + "documents": [f"{SRC_BASE}/{SRC_PRFX}/documents/{year:04d}-{month:02d}/*.zst" for month in months], "compression": {"input": "zst", "output": "zst"}, "output": { "path": f"{DST_BASE}/{DST_PRFX}/documents/{lang}/{year:04d}", @@ -24,15 +21,13 @@ def base_stream_config(lang: str, year: int, months: List[int]): }, "attributes": ["ft_lang_id_1e2", "dolma_v2_tokenizer"], "filter": { - "include": [ + "include": [], + "exclude": [ # at least 100 tokens - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - # make sure the language is present and the confidence is high enough and that it is the highest confidence - ( - f"(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__{lang} != null) and " - + f"(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__{lang}[0][-1] >= 0.5) and " - + f'((.attributes | to_entries | map(select(.key | startswith("ft_lang_id_1e2__ft_lang_id_1e2__"))) | max_by(.value) | .key ) == "ft_lang_id_1e2__ft_lang_id_1e2__{lang}")' - ), + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + # no language detected or low confidence + f"(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__{lang} == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__{lang}[0][-1] < 0.5)", + ], "syntax": "jq", }, diff --git a/configs/cc-news/mix_v1-year.yaml b/configs/cc-news/mix_v1-year.yaml new file mode 100644 index 00000000..048050ed --- /dev/null +++ b/configs/cc-news/mix_v1-year.yaml @@ -0,0 +1,204 @@ +streams: + - name: cc-news_2016 + documents: + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2016-08/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2016-09/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2016-10/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2016-11/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2016-12/*zst + + output: &output + path: s3://ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2016 + max_size_in_bytes: 1_000_000_000 + + compression: &compression + input: zst + output: zst + + attributes: &attributes + - dedupe_by_year + - dolma_v2_tokenizer + + filter: &filter + exclude: + - >- + (.attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100) + - >- + (.attributes.dedupe_ngrams_13_1 | length > 0) and + ((.attributes.dedupe_ngrams_13_1 | map(.[2] * (.[1] - .[0])) | add) / (.text | length) >= 0.5) + + syntax: jq + + - name: cc-news_2017 + documents: + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2017-01/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2017-02/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2017-03/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2017-04/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2017-05/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2017-06/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2017-07/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2017-08/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2017-09/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2017-10/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2017-11/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2017-12/*zst + output: + path: s3://ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2017 + <<: *output + + compression: *compression + attributes: *attributes + filter: *filter + + - name: cc-news_2018 + documents: + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2018-01/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2018-02/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2018-03/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2018-04/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2018-05/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2018-06/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2018-07/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2018-08/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2018-09/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2018-10/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2018-11/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2018-12/*zst + output: + path: s3://ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2018 + <<: *output + + compression: *compression + attributes: *attributes + filter: *filter + + - name: cc-news_2019 + documents: + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2019-01/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2019-02/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2019-03/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2019-04/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2019-05/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2019-06/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2019-07/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2019-08/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2019-09/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2019-10/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2019-11/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2019-12/*zst + output: + path: s3://ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2019 + <<: *output + + compression: *compression + attributes: *attributes + filter: *filter + + - name: cc-news_2020 + documents: + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2020-01/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2020-02/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2020-03/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2020-04/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2020-05/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2020-06/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2020-07/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2020-08/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2020-09/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2020-10/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2020-11/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2020-12/*zst + output: + path: s3://ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2020 + <<: *output + + compression: *compression + attributes: *attributes + filter: *filter + + - name: cc-news_2021 + documents: + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2021-01/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2021-02/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2021-03/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2021-04/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2021-05/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2021-06/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2021-07/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2021-08/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2021-09/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2021-10/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2021-11/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2021-12/*zst + output: + path: s3://ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2021 + <<: *output + + compression: *compression + attributes: *attributes + filter: *filter + + - name: cc-news_2022 + documents: + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2022-01/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2022-02/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2022-03/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2022-04/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2022-05/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2022-06/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2022-07/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2022-08/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2022-09/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2022-10/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2022-11/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2022-12/*zst + output: + path: s3://ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2022 + <<: *output + + compression: *compression + attributes: *attributes + filter: *filter + + - name: cc-news_2023 + documents: + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2023-01/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2023-02/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2023-03/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2023-04/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2023-05/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2023-06/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2023-07/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2023-08/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2023-09/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2023-10/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2023-11/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2023-12/*zst + output: + path: s3://ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2023 + <<: *output + + compression: *compression + attributes: *attributes + filter: *filter + + - name: cc-news_2024 + documents: + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2024-01/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2024-02/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2024-03/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2024-04/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2024-05/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2024-06/*zst + - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/2024-07/*zst + output: + path: s3://ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse-year/documents/2024 + <<: *output + + compression: *compression + attributes: *attributes + filter: *filter + + +processes: 1 diff --git a/configs/cc-news/mix_v2.json b/configs/cc-news/mix_v2.json index b59009fe..1214264c 100644 --- a/configs/cc-news/mix_v2.json +++ b/configs/cc-news/mix_v2.json @@ -15,7 +15,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/en/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/en/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -23,9 +23,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__en != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__en[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__en\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__en == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__en[0][-1] < 0.5)" ], "syntax": "jq" } @@ -44,7 +45,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/es/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/es/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -52,9 +53,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__es != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__es[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__es\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__es == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__es[0][-1] < 0.5)" ], "syntax": "jq" } @@ -73,7 +75,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ru/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ru/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -81,9 +83,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ru != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ru[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ru\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ru == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ru[0][-1] < 0.5)" ], "syntax": "jq" } @@ -102,7 +105,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/it/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/it/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -110,9 +113,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__it != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__it[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__it\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__it == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__it[0][-1] < 0.5)" ], "syntax": "jq" } @@ -131,7 +135,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/de/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/de/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -139,9 +143,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__de != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__de[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__de\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__de == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__de[0][-1] < 0.5)" ], "syntax": "jq" } @@ -160,7 +165,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fr/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fr/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -168,9 +173,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fr != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fr[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__fr\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fr == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fr[0][-1] < 0.5)" ], "syntax": "jq" } @@ -189,7 +195,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ar/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ar/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -197,9 +203,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ar != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ar[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ar\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ar == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ar[0][-1] < 0.5)" ], "syntax": "jq" } @@ -218,7 +225,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hi/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hi/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -226,9 +233,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hi != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hi[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__hi\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hi == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hi[0][-1] < 0.5)" ], "syntax": "jq" } @@ -247,7 +255,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/pt/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/pt/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -255,9 +263,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pt != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pt[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__pt\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pt == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pt[0][-1] < 0.5)" ], "syntax": "jq" } @@ -276,7 +285,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/tr/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/tr/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -284,9 +293,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tr != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tr[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__tr\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tr == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tr[0][-1] < 0.5)" ], "syntax": "jq" } @@ -305,7 +315,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/zh/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/zh/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -313,9 +323,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__zh != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__zh[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__zh\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__zh == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__zh[0][-1] < 0.5)" ], "syntax": "jq" } @@ -334,7 +345,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/el/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/el/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -342,9 +353,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__el != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__el[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__el\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__el == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__el[0][-1] < 0.5)" ], "syntax": "jq" } @@ -363,7 +375,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ja/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ja/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -371,9 +383,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ja != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ja[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ja\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ja == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ja[0][-1] < 0.5)" ], "syntax": "jq" } @@ -392,7 +405,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ro/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ro/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -400,9 +413,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ro != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ro[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ro\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ro == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ro[0][-1] < 0.5)" ], "syntax": "jq" } @@ -421,7 +435,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/nl/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/nl/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -429,9 +443,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nl != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nl[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__nl\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nl == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nl[0][-1] < 0.5)" ], "syntax": "jq" } @@ -450,7 +465,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ko/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ko/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -458,9 +473,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ko != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ko[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ko\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ko == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ko[0][-1] < 0.5)" ], "syntax": "jq" } @@ -479,7 +495,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/id/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/id/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -487,9 +503,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__id != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__id[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__id\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__id == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__id[0][-1] < 0.5)" ], "syntax": "jq" } @@ -508,7 +525,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/uk/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/uk/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -516,9 +533,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__uk != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__uk[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__uk\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__uk == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__uk[0][-1] < 0.5)" ], "syntax": "jq" } @@ -537,7 +555,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/vi/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/vi/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -545,9 +563,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__vi != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__vi[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__vi\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__vi == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__vi[0][-1] < 0.5)" ], "syntax": "jq" } @@ -566,7 +585,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/pl/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/pl/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -574,9 +593,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pl != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pl[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__pl\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pl == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pl[0][-1] < 0.5)" ], "syntax": "jq" } @@ -595,7 +615,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fa/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fa/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -603,9 +623,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fa != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fa[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__fa\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fa == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fa[0][-1] < 0.5)" ], "syntax": "jq" } @@ -624,7 +645,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sv/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sv/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -632,9 +653,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sv != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sv[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__sv\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sv == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sv[0][-1] < 0.5)" ], "syntax": "jq" } @@ -653,7 +675,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ta/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ta/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -661,9 +683,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ta != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ta[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ta\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ta == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ta[0][-1] < 0.5)" ], "syntax": "jq" } @@ -682,7 +705,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/bg/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/bg/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -690,9 +713,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bg != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bg[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__bg\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bg == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bg[0][-1] < 0.5)" ], "syntax": "jq" } @@ -711,7 +735,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hu/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hu/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -719,9 +743,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hu != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hu[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__hu\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hu == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hu[0][-1] < 0.5)" ], "syntax": "jq" } @@ -740,7 +765,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/cs/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/cs/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -748,9 +773,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__cs != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__cs[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__cs\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__cs == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__cs[0][-1] < 0.5)" ], "syntax": "jq" } @@ -769,7 +795,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ur/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ur/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -777,9 +803,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ur != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ur[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ur\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ur == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ur[0][-1] < 0.5)" ], "syntax": "jq" } @@ -798,7 +825,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/no/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/no/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -806,9 +833,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__no != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__no[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__no\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__no == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__no[0][-1] < 0.5)" ], "syntax": "jq" } @@ -827,7 +855,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fi/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fi/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -835,9 +863,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fi != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fi[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__fi\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fi == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fi[0][-1] < 0.5)" ], "syntax": "jq" } @@ -856,7 +885,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/bn/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/bn/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -864,9 +893,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bn != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bn[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__bn\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bn == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bn[0][-1] < 0.5)" ], "syntax": "jq" } @@ -885,7 +915,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/et/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/et/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -893,9 +923,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__et != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__et[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__et\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__et == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__et[0][-1] < 0.5)" ], "syntax": "jq" } @@ -914,7 +945,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/mr/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/mr/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -922,9 +953,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mr != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mr[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__mr\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mr == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mr[0][-1] < 0.5)" ], "syntax": "jq" } @@ -943,7 +975,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hr/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hr/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -951,9 +983,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hr != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hr[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__hr\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hr == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hr[0][-1] < 0.5)" ], "syntax": "jq" } @@ -972,7 +1005,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ml/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ml/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -980,9 +1013,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ml != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ml[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ml\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ml == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ml[0][-1] < 0.5)" ], "syntax": "jq" } @@ -1001,7 +1035,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ca/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ca/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -1009,9 +1043,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ca != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ca[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ca\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ca == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ca[0][-1] < 0.5)" ], "syntax": "jq" } @@ -1030,7 +1065,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/te/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/te/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -1038,9 +1073,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__te != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__te[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__te\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__te == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__te[0][-1] < 0.5)" ], "syntax": "jq" } @@ -1059,7 +1095,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/da/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/da/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -1067,9 +1103,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__da != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__da[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__da\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__da == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__da[0][-1] < 0.5)" ], "syntax": "jq" } @@ -1088,7 +1125,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sl/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sl/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -1096,9 +1133,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sl != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sl[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__sl\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sl == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sl[0][-1] < 0.5)" ], "syntax": "jq" } @@ -1117,7 +1155,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sk/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sk/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -1125,9 +1163,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sk != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sk[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__sk\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sk == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sk[0][-1] < 0.5)" ], "syntax": "jq" } @@ -1146,7 +1185,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sq/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sq/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -1154,9 +1193,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sq != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sq[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__sq\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sq == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sq[0][-1] < 0.5)" ], "syntax": "jq" } @@ -1175,7 +1215,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sr/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sr/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -1183,9 +1223,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sr != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sr[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__sr\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sr == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sr[0][-1] < 0.5)" ], "syntax": "jq" } @@ -1204,7 +1245,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/az/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/az/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -1212,9 +1253,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__az != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__az[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__az\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__az == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__az[0][-1] < 0.5)" ], "syntax": "jq" } @@ -1233,7 +1275,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/he/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/he/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -1241,9 +1283,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__he != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__he[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__he\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__he == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__he[0][-1] < 0.5)" ], "syntax": "jq" } @@ -1262,7 +1305,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/th/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/th/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -1270,9 +1313,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__th != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__th[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__th\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__th == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__th[0][-1] < 0.5)" ], "syntax": "jq" } @@ -1291,7 +1335,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/lt/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/lt/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -1299,9 +1343,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lt != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lt[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__lt\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lt == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lt[0][-1] < 0.5)" ], "syntax": "jq" } @@ -1320,7 +1365,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/kn/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/kn/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -1328,9 +1373,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kn != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kn[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__kn\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kn == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kn[0][-1] < 0.5)" ], "syntax": "jq" } @@ -1349,7 +1395,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/lv/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/lv/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -1357,9 +1403,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lv != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lv[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__lv\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lv == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lv[0][-1] < 0.5)" ], "syntax": "jq" } @@ -1378,7 +1425,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/mk/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/mk/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -1386,9 +1433,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mk != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mk[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__mk\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mk == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mk[0][-1] < 0.5)" ], "syntax": "jq" } @@ -1407,7 +1455,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hy/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hy/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -1415,9 +1463,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hy != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hy[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__hy\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hy == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hy[0][-1] < 0.5)" ], "syntax": "jq" } @@ -1436,7 +1485,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/is/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/is/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -1444,9 +1493,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__is != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__is[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__is\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__is == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__is[0][-1] < 0.5)" ], "syntax": "jq" } @@ -1465,7 +1515,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/kk/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/kk/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -1473,9 +1523,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kk != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kk[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__kk\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kk == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kk[0][-1] < 0.5)" ], "syntax": "jq" } @@ -1494,7 +1545,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/gu/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/gu/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -1502,9 +1553,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gu != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gu[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__gu\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gu == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gu[0][-1] < 0.5)" ], "syntax": "jq" } @@ -1523,7 +1575,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ka/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ka/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -1531,9 +1583,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ka != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ka[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ka\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ka == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ka[0][-1] < 0.5)" ], "syntax": "jq" } @@ -1552,7 +1605,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/nn/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/nn/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -1560,9 +1613,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nn != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nn[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__nn\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nn == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nn[0][-1] < 0.5)" ], "syntax": "jq" } @@ -1581,7 +1635,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ne/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ne/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -1589,9 +1643,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ne != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ne[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ne\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ne == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ne[0][-1] < 0.5)" ], "syntax": "jq" } @@ -1610,7 +1665,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/gl/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/gl/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -1618,9 +1673,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gl != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gl[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__gl\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gl == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gl[0][-1] < 0.5)" ], "syntax": "jq" } @@ -1639,7 +1695,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ckb/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ckb/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -1647,9 +1703,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ckb != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ckb[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ckb\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ckb == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ckb[0][-1] < 0.5)" ], "syntax": "jq" } @@ -1668,7 +1725,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ky/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ky/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -1676,9 +1733,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ky != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ky[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ky\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ky == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ky[0][-1] < 0.5)" ], "syntax": "jq" } @@ -1697,7 +1755,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/tl/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/tl/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -1705,9 +1763,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tl != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tl[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__tl\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tl == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tl[0][-1] < 0.5)" ], "syntax": "jq" } @@ -1726,7 +1785,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ug/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ug/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -1734,9 +1793,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ug != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ug[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ug\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ug == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ug[0][-1] < 0.5)" ], "syntax": "jq" } @@ -1755,7 +1815,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/be/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/be/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -1763,9 +1823,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__be != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__be[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__be\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__be == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__be[0][-1] < 0.5)" ], "syntax": "jq" } @@ -1784,7 +1845,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/eu/2016", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/eu/2016", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -1792,9 +1853,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__eu != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__eu[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__eu\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__eu == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__eu[0][-1] < 0.5)" ], "syntax": "jq" } @@ -1820,7 +1882,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/en/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/en/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -1828,9 +1890,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__en != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__en[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__en\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__en == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__en[0][-1] < 0.5)" ], "syntax": "jq" } @@ -1856,7 +1919,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/es/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/es/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -1864,9 +1927,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__es != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__es[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__es\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__es == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__es[0][-1] < 0.5)" ], "syntax": "jq" } @@ -1892,7 +1956,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ru/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ru/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -1900,9 +1964,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ru != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ru[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ru\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ru == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ru[0][-1] < 0.5)" ], "syntax": "jq" } @@ -1928,7 +1993,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/it/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/it/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -1936,9 +2001,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__it != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__it[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__it\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__it == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__it[0][-1] < 0.5)" ], "syntax": "jq" } @@ -1964,7 +2030,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/de/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/de/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -1972,9 +2038,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__de != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__de[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__de\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__de == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__de[0][-1] < 0.5)" ], "syntax": "jq" } @@ -2000,7 +2067,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fr/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fr/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -2008,9 +2075,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fr != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fr[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__fr\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fr == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fr[0][-1] < 0.5)" ], "syntax": "jq" } @@ -2036,7 +2104,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ar/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ar/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -2044,9 +2112,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ar != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ar[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ar\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ar == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ar[0][-1] < 0.5)" ], "syntax": "jq" } @@ -2072,7 +2141,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hi/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hi/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -2080,9 +2149,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hi != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hi[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__hi\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hi == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hi[0][-1] < 0.5)" ], "syntax": "jq" } @@ -2108,7 +2178,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/pt/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/pt/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -2116,9 +2186,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pt != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pt[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__pt\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pt == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pt[0][-1] < 0.5)" ], "syntax": "jq" } @@ -2144,7 +2215,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/tr/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/tr/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -2152,9 +2223,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tr != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tr[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__tr\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tr == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tr[0][-1] < 0.5)" ], "syntax": "jq" } @@ -2180,7 +2252,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/zh/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/zh/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -2188,9 +2260,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__zh != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__zh[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__zh\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__zh == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__zh[0][-1] < 0.5)" ], "syntax": "jq" } @@ -2216,7 +2289,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/el/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/el/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -2224,9 +2297,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__el != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__el[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__el\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__el == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__el[0][-1] < 0.5)" ], "syntax": "jq" } @@ -2252,7 +2326,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ja/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ja/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -2260,9 +2334,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ja != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ja[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ja\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ja == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ja[0][-1] < 0.5)" ], "syntax": "jq" } @@ -2288,7 +2363,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ro/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ro/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -2296,9 +2371,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ro != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ro[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ro\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ro == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ro[0][-1] < 0.5)" ], "syntax": "jq" } @@ -2324,7 +2400,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/nl/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/nl/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -2332,9 +2408,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nl != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nl[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__nl\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nl == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nl[0][-1] < 0.5)" ], "syntax": "jq" } @@ -2360,7 +2437,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ko/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ko/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -2368,9 +2445,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ko != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ko[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ko\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ko == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ko[0][-1] < 0.5)" ], "syntax": "jq" } @@ -2396,7 +2474,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/id/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/id/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -2404,9 +2482,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__id != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__id[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__id\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__id == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__id[0][-1] < 0.5)" ], "syntax": "jq" } @@ -2432,7 +2511,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/uk/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/uk/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -2440,9 +2519,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__uk != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__uk[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__uk\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__uk == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__uk[0][-1] < 0.5)" ], "syntax": "jq" } @@ -2468,7 +2548,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/vi/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/vi/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -2476,9 +2556,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__vi != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__vi[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__vi\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__vi == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__vi[0][-1] < 0.5)" ], "syntax": "jq" } @@ -2504,7 +2585,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/pl/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/pl/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -2512,9 +2593,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pl != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pl[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__pl\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pl == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pl[0][-1] < 0.5)" ], "syntax": "jq" } @@ -2540,7 +2622,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fa/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fa/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -2548,9 +2630,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fa != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fa[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__fa\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fa == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fa[0][-1] < 0.5)" ], "syntax": "jq" } @@ -2576,7 +2659,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sv/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sv/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -2584,9 +2667,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sv != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sv[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__sv\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sv == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sv[0][-1] < 0.5)" ], "syntax": "jq" } @@ -2612,7 +2696,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ta/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ta/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -2620,9 +2704,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ta != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ta[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ta\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ta == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ta[0][-1] < 0.5)" ], "syntax": "jq" } @@ -2648,7 +2733,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/bg/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/bg/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -2656,9 +2741,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bg != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bg[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__bg\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bg == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bg[0][-1] < 0.5)" ], "syntax": "jq" } @@ -2684,7 +2770,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hu/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hu/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -2692,9 +2778,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hu != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hu[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__hu\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hu == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hu[0][-1] < 0.5)" ], "syntax": "jq" } @@ -2720,7 +2807,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/cs/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/cs/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -2728,9 +2815,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__cs != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__cs[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__cs\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__cs == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__cs[0][-1] < 0.5)" ], "syntax": "jq" } @@ -2756,7 +2844,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ur/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ur/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -2764,9 +2852,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ur != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ur[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ur\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ur == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ur[0][-1] < 0.5)" ], "syntax": "jq" } @@ -2792,7 +2881,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/no/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/no/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -2800,9 +2889,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__no != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__no[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__no\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__no == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__no[0][-1] < 0.5)" ], "syntax": "jq" } @@ -2828,7 +2918,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fi/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fi/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -2836,9 +2926,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fi != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fi[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__fi\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fi == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fi[0][-1] < 0.5)" ], "syntax": "jq" } @@ -2864,7 +2955,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/bn/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/bn/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -2872,9 +2963,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bn != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bn[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__bn\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bn == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bn[0][-1] < 0.5)" ], "syntax": "jq" } @@ -2900,7 +2992,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/et/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/et/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -2908,9 +3000,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__et != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__et[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__et\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__et == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__et[0][-1] < 0.5)" ], "syntax": "jq" } @@ -2936,7 +3029,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/mr/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/mr/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -2944,9 +3037,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mr != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mr[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__mr\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mr == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mr[0][-1] < 0.5)" ], "syntax": "jq" } @@ -2972,7 +3066,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hr/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hr/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -2980,9 +3074,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hr != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hr[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__hr\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hr == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hr[0][-1] < 0.5)" ], "syntax": "jq" } @@ -3008,7 +3103,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ml/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ml/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -3016,9 +3111,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ml != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ml[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ml\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ml == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ml[0][-1] < 0.5)" ], "syntax": "jq" } @@ -3044,7 +3140,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ca/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ca/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -3052,9 +3148,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ca != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ca[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ca\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ca == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ca[0][-1] < 0.5)" ], "syntax": "jq" } @@ -3080,7 +3177,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/te/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/te/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -3088,9 +3185,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__te != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__te[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__te\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__te == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__te[0][-1] < 0.5)" ], "syntax": "jq" } @@ -3116,7 +3214,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/da/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/da/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -3124,9 +3222,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__da != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__da[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__da\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__da == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__da[0][-1] < 0.5)" ], "syntax": "jq" } @@ -3152,7 +3251,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sl/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sl/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -3160,9 +3259,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sl != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sl[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__sl\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sl == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sl[0][-1] < 0.5)" ], "syntax": "jq" } @@ -3188,7 +3288,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sk/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sk/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -3196,9 +3296,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sk != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sk[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__sk\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sk == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sk[0][-1] < 0.5)" ], "syntax": "jq" } @@ -3224,7 +3325,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sq/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sq/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -3232,9 +3333,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sq != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sq[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__sq\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sq == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sq[0][-1] < 0.5)" ], "syntax": "jq" } @@ -3260,7 +3362,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sr/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sr/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -3268,9 +3370,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sr != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sr[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__sr\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sr == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sr[0][-1] < 0.5)" ], "syntax": "jq" } @@ -3296,7 +3399,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/az/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/az/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -3304,9 +3407,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__az != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__az[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__az\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__az == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__az[0][-1] < 0.5)" ], "syntax": "jq" } @@ -3332,7 +3436,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/he/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/he/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -3340,9 +3444,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__he != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__he[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__he\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__he == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__he[0][-1] < 0.5)" ], "syntax": "jq" } @@ -3368,7 +3473,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/th/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/th/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -3376,9 +3481,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__th != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__th[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__th\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__th == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__th[0][-1] < 0.5)" ], "syntax": "jq" } @@ -3404,7 +3510,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/lt/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/lt/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -3412,9 +3518,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lt != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lt[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__lt\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lt == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lt[0][-1] < 0.5)" ], "syntax": "jq" } @@ -3440,7 +3547,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/kn/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/kn/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -3448,9 +3555,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kn != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kn[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__kn\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kn == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kn[0][-1] < 0.5)" ], "syntax": "jq" } @@ -3476,7 +3584,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/lv/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/lv/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -3484,9 +3592,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lv != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lv[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__lv\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lv == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lv[0][-1] < 0.5)" ], "syntax": "jq" } @@ -3512,7 +3621,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/mk/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/mk/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -3520,9 +3629,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mk != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mk[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__mk\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mk == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mk[0][-1] < 0.5)" ], "syntax": "jq" } @@ -3548,7 +3658,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hy/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hy/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -3556,9 +3666,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hy != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hy[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__hy\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hy == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hy[0][-1] < 0.5)" ], "syntax": "jq" } @@ -3584,7 +3695,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/is/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/is/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -3592,9 +3703,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__is != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__is[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__is\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__is == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__is[0][-1] < 0.5)" ], "syntax": "jq" } @@ -3620,7 +3732,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/kk/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/kk/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -3628,9 +3740,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kk != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kk[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__kk\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kk == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kk[0][-1] < 0.5)" ], "syntax": "jq" } @@ -3656,7 +3769,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/gu/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/gu/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -3664,9 +3777,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gu != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gu[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__gu\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gu == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gu[0][-1] < 0.5)" ], "syntax": "jq" } @@ -3692,7 +3806,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ka/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ka/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -3700,9 +3814,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ka != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ka[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ka\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ka == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ka[0][-1] < 0.5)" ], "syntax": "jq" } @@ -3728,7 +3843,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/nn/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/nn/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -3736,9 +3851,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nn != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nn[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__nn\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nn == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nn[0][-1] < 0.5)" ], "syntax": "jq" } @@ -3764,7 +3880,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ne/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ne/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -3772,9 +3888,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ne != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ne[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ne\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ne == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ne[0][-1] < 0.5)" ], "syntax": "jq" } @@ -3800,7 +3917,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/gl/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/gl/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -3808,9 +3925,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gl != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gl[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__gl\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gl == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gl[0][-1] < 0.5)" ], "syntax": "jq" } @@ -3836,7 +3954,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ckb/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ckb/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -3844,9 +3962,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ckb != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ckb[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ckb\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ckb == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ckb[0][-1] < 0.5)" ], "syntax": "jq" } @@ -3872,7 +3991,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ky/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ky/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -3880,9 +3999,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ky != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ky[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ky\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ky == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ky[0][-1] < 0.5)" ], "syntax": "jq" } @@ -3908,7 +4028,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/tl/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/tl/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -3916,9 +4036,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tl != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tl[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__tl\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tl == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tl[0][-1] < 0.5)" ], "syntax": "jq" } @@ -3944,7 +4065,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ug/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ug/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -3952,9 +4073,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ug != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ug[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ug\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ug == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ug[0][-1] < 0.5)" ], "syntax": "jq" } @@ -3980,7 +4102,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/be/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/be/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -3988,9 +4110,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__be != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__be[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__be\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__be == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__be[0][-1] < 0.5)" ], "syntax": "jq" } @@ -4016,7 +4139,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/eu/2017", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/eu/2017", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -4024,9 +4147,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__eu != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__eu[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__eu\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__eu == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__eu[0][-1] < 0.5)" ], "syntax": "jq" } @@ -4052,7 +4176,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/en/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/en/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -4060,9 +4184,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__en != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__en[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__en\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__en == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__en[0][-1] < 0.5)" ], "syntax": "jq" } @@ -4088,7 +4213,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/es/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/es/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -4096,9 +4221,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__es != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__es[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__es\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__es == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__es[0][-1] < 0.5)" ], "syntax": "jq" } @@ -4124,7 +4250,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ru/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ru/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -4132,9 +4258,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ru != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ru[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ru\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ru == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ru[0][-1] < 0.5)" ], "syntax": "jq" } @@ -4160,7 +4287,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/it/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/it/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -4168,9 +4295,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__it != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__it[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__it\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__it == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__it[0][-1] < 0.5)" ], "syntax": "jq" } @@ -4196,7 +4324,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/de/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/de/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -4204,9 +4332,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__de != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__de[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__de\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__de == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__de[0][-1] < 0.5)" ], "syntax": "jq" } @@ -4232,7 +4361,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fr/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fr/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -4240,9 +4369,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fr != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fr[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__fr\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fr == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fr[0][-1] < 0.5)" ], "syntax": "jq" } @@ -4268,7 +4398,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ar/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ar/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -4276,9 +4406,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ar != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ar[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ar\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ar == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ar[0][-1] < 0.5)" ], "syntax": "jq" } @@ -4304,7 +4435,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hi/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hi/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -4312,9 +4443,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hi != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hi[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__hi\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hi == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hi[0][-1] < 0.5)" ], "syntax": "jq" } @@ -4340,7 +4472,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/pt/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/pt/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -4348,9 +4480,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pt != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pt[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__pt\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pt == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pt[0][-1] < 0.5)" ], "syntax": "jq" } @@ -4376,7 +4509,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/tr/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/tr/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -4384,9 +4517,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tr != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tr[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__tr\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tr == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tr[0][-1] < 0.5)" ], "syntax": "jq" } @@ -4412,7 +4546,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/zh/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/zh/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -4420,9 +4554,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__zh != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__zh[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__zh\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__zh == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__zh[0][-1] < 0.5)" ], "syntax": "jq" } @@ -4448,7 +4583,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/el/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/el/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -4456,9 +4591,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__el != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__el[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__el\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__el == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__el[0][-1] < 0.5)" ], "syntax": "jq" } @@ -4484,7 +4620,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ja/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ja/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -4492,9 +4628,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ja != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ja[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ja\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ja == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ja[0][-1] < 0.5)" ], "syntax": "jq" } @@ -4520,7 +4657,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ro/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ro/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -4528,9 +4665,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ro != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ro[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ro\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ro == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ro[0][-1] < 0.5)" ], "syntax": "jq" } @@ -4556,7 +4694,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/nl/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/nl/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -4564,9 +4702,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nl != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nl[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__nl\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nl == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nl[0][-1] < 0.5)" ], "syntax": "jq" } @@ -4592,7 +4731,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ko/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ko/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -4600,9 +4739,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ko != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ko[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ko\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ko == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ko[0][-1] < 0.5)" ], "syntax": "jq" } @@ -4628,7 +4768,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/id/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/id/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -4636,9 +4776,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__id != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__id[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__id\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__id == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__id[0][-1] < 0.5)" ], "syntax": "jq" } @@ -4664,7 +4805,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/uk/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/uk/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -4672,9 +4813,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__uk != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__uk[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__uk\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__uk == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__uk[0][-1] < 0.5)" ], "syntax": "jq" } @@ -4700,7 +4842,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/vi/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/vi/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -4708,9 +4850,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__vi != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__vi[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__vi\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__vi == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__vi[0][-1] < 0.5)" ], "syntax": "jq" } @@ -4736,7 +4879,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/pl/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/pl/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -4744,9 +4887,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pl != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pl[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__pl\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pl == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pl[0][-1] < 0.5)" ], "syntax": "jq" } @@ -4772,7 +4916,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fa/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fa/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -4780,9 +4924,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fa != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fa[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__fa\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fa == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fa[0][-1] < 0.5)" ], "syntax": "jq" } @@ -4808,7 +4953,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sv/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sv/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -4816,9 +4961,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sv != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sv[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__sv\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sv == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sv[0][-1] < 0.5)" ], "syntax": "jq" } @@ -4844,7 +4990,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ta/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ta/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -4852,9 +4998,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ta != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ta[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ta\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ta == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ta[0][-1] < 0.5)" ], "syntax": "jq" } @@ -4880,7 +5027,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/bg/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/bg/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -4888,9 +5035,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bg != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bg[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__bg\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bg == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bg[0][-1] < 0.5)" ], "syntax": "jq" } @@ -4916,7 +5064,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hu/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hu/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -4924,9 +5072,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hu != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hu[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__hu\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hu == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hu[0][-1] < 0.5)" ], "syntax": "jq" } @@ -4952,7 +5101,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/cs/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/cs/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -4960,9 +5109,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__cs != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__cs[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__cs\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__cs == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__cs[0][-1] < 0.5)" ], "syntax": "jq" } @@ -4988,7 +5138,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ur/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ur/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -4996,9 +5146,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ur != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ur[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ur\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ur == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ur[0][-1] < 0.5)" ], "syntax": "jq" } @@ -5024,7 +5175,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/no/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/no/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -5032,9 +5183,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__no != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__no[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__no\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__no == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__no[0][-1] < 0.5)" ], "syntax": "jq" } @@ -5060,7 +5212,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fi/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fi/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -5068,9 +5220,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fi != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fi[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__fi\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fi == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fi[0][-1] < 0.5)" ], "syntax": "jq" } @@ -5096,7 +5249,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/bn/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/bn/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -5104,9 +5257,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bn != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bn[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__bn\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bn == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bn[0][-1] < 0.5)" ], "syntax": "jq" } @@ -5132,7 +5286,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/et/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/et/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -5140,9 +5294,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__et != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__et[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__et\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__et == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__et[0][-1] < 0.5)" ], "syntax": "jq" } @@ -5168,7 +5323,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/mr/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/mr/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -5176,9 +5331,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mr != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mr[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__mr\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mr == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mr[0][-1] < 0.5)" ], "syntax": "jq" } @@ -5204,7 +5360,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hr/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hr/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -5212,9 +5368,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hr != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hr[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__hr\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hr == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hr[0][-1] < 0.5)" ], "syntax": "jq" } @@ -5240,7 +5397,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ml/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ml/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -5248,9 +5405,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ml != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ml[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ml\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ml == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ml[0][-1] < 0.5)" ], "syntax": "jq" } @@ -5276,7 +5434,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ca/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ca/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -5284,9 +5442,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ca != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ca[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ca\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ca == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ca[0][-1] < 0.5)" ], "syntax": "jq" } @@ -5312,7 +5471,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/te/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/te/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -5320,9 +5479,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__te != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__te[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__te\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__te == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__te[0][-1] < 0.5)" ], "syntax": "jq" } @@ -5348,7 +5508,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/da/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/da/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -5356,9 +5516,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__da != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__da[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__da\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__da == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__da[0][-1] < 0.5)" ], "syntax": "jq" } @@ -5384,7 +5545,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sl/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sl/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -5392,9 +5553,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sl != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sl[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__sl\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sl == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sl[0][-1] < 0.5)" ], "syntax": "jq" } @@ -5420,7 +5582,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sk/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sk/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -5428,9 +5590,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sk != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sk[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__sk\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sk == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sk[0][-1] < 0.5)" ], "syntax": "jq" } @@ -5456,7 +5619,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sq/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sq/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -5464,9 +5627,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sq != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sq[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__sq\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sq == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sq[0][-1] < 0.5)" ], "syntax": "jq" } @@ -5492,7 +5656,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sr/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sr/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -5500,9 +5664,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sr != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sr[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__sr\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sr == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sr[0][-1] < 0.5)" ], "syntax": "jq" } @@ -5528,7 +5693,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/az/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/az/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -5536,9 +5701,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__az != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__az[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__az\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__az == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__az[0][-1] < 0.5)" ], "syntax": "jq" } @@ -5564,7 +5730,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/he/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/he/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -5572,9 +5738,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__he != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__he[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__he\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__he == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__he[0][-1] < 0.5)" ], "syntax": "jq" } @@ -5600,7 +5767,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/th/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/th/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -5608,9 +5775,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__th != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__th[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__th\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__th == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__th[0][-1] < 0.5)" ], "syntax": "jq" } @@ -5636,7 +5804,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/lt/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/lt/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -5644,9 +5812,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lt != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lt[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__lt\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lt == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lt[0][-1] < 0.5)" ], "syntax": "jq" } @@ -5672,7 +5841,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/kn/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/kn/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -5680,9 +5849,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kn != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kn[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__kn\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kn == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kn[0][-1] < 0.5)" ], "syntax": "jq" } @@ -5708,7 +5878,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/lv/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/lv/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -5716,9 +5886,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lv != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lv[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__lv\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lv == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lv[0][-1] < 0.5)" ], "syntax": "jq" } @@ -5744,7 +5915,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/mk/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/mk/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -5752,9 +5923,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mk != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mk[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__mk\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mk == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mk[0][-1] < 0.5)" ], "syntax": "jq" } @@ -5780,7 +5952,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hy/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hy/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -5788,9 +5960,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hy != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hy[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__hy\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hy == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hy[0][-1] < 0.5)" ], "syntax": "jq" } @@ -5816,7 +5989,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/is/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/is/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -5824,9 +5997,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__is != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__is[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__is\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__is == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__is[0][-1] < 0.5)" ], "syntax": "jq" } @@ -5852,7 +6026,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/kk/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/kk/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -5860,9 +6034,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kk != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kk[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__kk\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kk == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kk[0][-1] < 0.5)" ], "syntax": "jq" } @@ -5888,7 +6063,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/gu/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/gu/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -5896,9 +6071,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gu != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gu[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__gu\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gu == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gu[0][-1] < 0.5)" ], "syntax": "jq" } @@ -5924,7 +6100,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ka/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ka/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -5932,9 +6108,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ka != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ka[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ka\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ka == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ka[0][-1] < 0.5)" ], "syntax": "jq" } @@ -5960,7 +6137,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/nn/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/nn/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -5968,9 +6145,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nn != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nn[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__nn\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nn == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nn[0][-1] < 0.5)" ], "syntax": "jq" } @@ -5996,7 +6174,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ne/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ne/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -6004,9 +6182,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ne != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ne[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ne\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ne == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ne[0][-1] < 0.5)" ], "syntax": "jq" } @@ -6032,7 +6211,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/gl/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/gl/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -6040,9 +6219,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gl != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gl[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__gl\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gl == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gl[0][-1] < 0.5)" ], "syntax": "jq" } @@ -6068,7 +6248,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ckb/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ckb/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -6076,9 +6256,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ckb != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ckb[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ckb\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ckb == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ckb[0][-1] < 0.5)" ], "syntax": "jq" } @@ -6104,7 +6285,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ky/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ky/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -6112,9 +6293,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ky != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ky[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ky\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ky == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ky[0][-1] < 0.5)" ], "syntax": "jq" } @@ -6140,7 +6322,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/tl/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/tl/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -6148,9 +6330,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tl != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tl[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__tl\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tl == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tl[0][-1] < 0.5)" ], "syntax": "jq" } @@ -6176,7 +6359,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ug/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ug/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -6184,9 +6367,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ug != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ug[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ug\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ug == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ug[0][-1] < 0.5)" ], "syntax": "jq" } @@ -6212,7 +6396,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/be/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/be/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -6220,9 +6404,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__be != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__be[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__be\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__be == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__be[0][-1] < 0.5)" ], "syntax": "jq" } @@ -6248,7 +6433,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/eu/2018", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/eu/2018", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -6256,9 +6441,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__eu != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__eu[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__eu\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__eu == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__eu[0][-1] < 0.5)" ], "syntax": "jq" } @@ -6284,7 +6470,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/en/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/en/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -6292,9 +6478,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__en != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__en[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__en\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__en == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__en[0][-1] < 0.5)" ], "syntax": "jq" } @@ -6320,7 +6507,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/es/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/es/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -6328,9 +6515,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__es != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__es[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__es\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__es == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__es[0][-1] < 0.5)" ], "syntax": "jq" } @@ -6356,7 +6544,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ru/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ru/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -6364,9 +6552,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ru != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ru[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ru\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ru == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ru[0][-1] < 0.5)" ], "syntax": "jq" } @@ -6392,7 +6581,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/it/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/it/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -6400,9 +6589,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__it != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__it[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__it\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__it == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__it[0][-1] < 0.5)" ], "syntax": "jq" } @@ -6428,7 +6618,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/de/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/de/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -6436,9 +6626,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__de != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__de[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__de\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__de == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__de[0][-1] < 0.5)" ], "syntax": "jq" } @@ -6464,7 +6655,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fr/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fr/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -6472,9 +6663,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fr != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fr[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__fr\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fr == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fr[0][-1] < 0.5)" ], "syntax": "jq" } @@ -6500,7 +6692,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ar/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ar/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -6508,9 +6700,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ar != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ar[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ar\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ar == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ar[0][-1] < 0.5)" ], "syntax": "jq" } @@ -6536,7 +6729,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hi/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hi/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -6544,9 +6737,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hi != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hi[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__hi\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hi == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hi[0][-1] < 0.5)" ], "syntax": "jq" } @@ -6572,7 +6766,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/pt/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/pt/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -6580,9 +6774,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pt != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pt[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__pt\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pt == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pt[0][-1] < 0.5)" ], "syntax": "jq" } @@ -6608,7 +6803,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/tr/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/tr/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -6616,9 +6811,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tr != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tr[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__tr\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tr == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tr[0][-1] < 0.5)" ], "syntax": "jq" } @@ -6644,7 +6840,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/zh/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/zh/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -6652,9 +6848,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__zh != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__zh[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__zh\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__zh == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__zh[0][-1] < 0.5)" ], "syntax": "jq" } @@ -6680,7 +6877,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/el/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/el/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -6688,9 +6885,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__el != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__el[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__el\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__el == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__el[0][-1] < 0.5)" ], "syntax": "jq" } @@ -6716,7 +6914,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ja/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ja/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -6724,9 +6922,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ja != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ja[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ja\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ja == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ja[0][-1] < 0.5)" ], "syntax": "jq" } @@ -6752,7 +6951,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ro/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ro/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -6760,9 +6959,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ro != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ro[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ro\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ro == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ro[0][-1] < 0.5)" ], "syntax": "jq" } @@ -6788,7 +6988,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/nl/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/nl/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -6796,9 +6996,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nl != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nl[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__nl\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nl == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nl[0][-1] < 0.5)" ], "syntax": "jq" } @@ -6824,7 +7025,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ko/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ko/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -6832,9 +7033,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ko != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ko[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ko\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ko == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ko[0][-1] < 0.5)" ], "syntax": "jq" } @@ -6860,7 +7062,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/id/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/id/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -6868,9 +7070,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__id != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__id[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__id\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__id == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__id[0][-1] < 0.5)" ], "syntax": "jq" } @@ -6896,7 +7099,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/uk/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/uk/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -6904,9 +7107,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__uk != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__uk[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__uk\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__uk == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__uk[0][-1] < 0.5)" ], "syntax": "jq" } @@ -6932,7 +7136,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/vi/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/vi/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -6940,9 +7144,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__vi != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__vi[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__vi\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__vi == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__vi[0][-1] < 0.5)" ], "syntax": "jq" } @@ -6968,7 +7173,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/pl/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/pl/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -6976,9 +7181,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pl != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pl[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__pl\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pl == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pl[0][-1] < 0.5)" ], "syntax": "jq" } @@ -7004,7 +7210,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fa/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fa/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -7012,9 +7218,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fa != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fa[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__fa\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fa == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fa[0][-1] < 0.5)" ], "syntax": "jq" } @@ -7040,7 +7247,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sv/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sv/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -7048,9 +7255,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sv != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sv[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__sv\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sv == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sv[0][-1] < 0.5)" ], "syntax": "jq" } @@ -7076,7 +7284,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ta/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ta/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -7084,9 +7292,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ta != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ta[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ta\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ta == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ta[0][-1] < 0.5)" ], "syntax": "jq" } @@ -7112,7 +7321,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/bg/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/bg/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -7120,9 +7329,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bg != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bg[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__bg\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bg == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bg[0][-1] < 0.5)" ], "syntax": "jq" } @@ -7148,7 +7358,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hu/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hu/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -7156,9 +7366,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hu != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hu[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__hu\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hu == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hu[0][-1] < 0.5)" ], "syntax": "jq" } @@ -7184,7 +7395,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/cs/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/cs/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -7192,9 +7403,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__cs != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__cs[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__cs\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__cs == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__cs[0][-1] < 0.5)" ], "syntax": "jq" } @@ -7220,7 +7432,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ur/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ur/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -7228,9 +7440,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ur != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ur[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ur\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ur == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ur[0][-1] < 0.5)" ], "syntax": "jq" } @@ -7256,7 +7469,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/no/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/no/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -7264,9 +7477,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__no != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__no[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__no\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__no == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__no[0][-1] < 0.5)" ], "syntax": "jq" } @@ -7292,7 +7506,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fi/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fi/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -7300,9 +7514,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fi != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fi[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__fi\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fi == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fi[0][-1] < 0.5)" ], "syntax": "jq" } @@ -7328,7 +7543,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/bn/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/bn/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -7336,9 +7551,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bn != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bn[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__bn\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bn == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bn[0][-1] < 0.5)" ], "syntax": "jq" } @@ -7364,7 +7580,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/et/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/et/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -7372,9 +7588,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__et != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__et[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__et\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__et == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__et[0][-1] < 0.5)" ], "syntax": "jq" } @@ -7400,7 +7617,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/mr/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/mr/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -7408,9 +7625,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mr != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mr[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__mr\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mr == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mr[0][-1] < 0.5)" ], "syntax": "jq" } @@ -7436,7 +7654,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hr/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hr/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -7444,9 +7662,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hr != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hr[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__hr\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hr == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hr[0][-1] < 0.5)" ], "syntax": "jq" } @@ -7472,7 +7691,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ml/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ml/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -7480,9 +7699,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ml != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ml[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ml\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ml == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ml[0][-1] < 0.5)" ], "syntax": "jq" } @@ -7508,7 +7728,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ca/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ca/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -7516,9 +7736,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ca != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ca[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ca\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ca == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ca[0][-1] < 0.5)" ], "syntax": "jq" } @@ -7544,7 +7765,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/te/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/te/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -7552,9 +7773,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__te != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__te[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__te\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__te == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__te[0][-1] < 0.5)" ], "syntax": "jq" } @@ -7580,7 +7802,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/da/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/da/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -7588,9 +7810,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__da != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__da[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__da\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__da == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__da[0][-1] < 0.5)" ], "syntax": "jq" } @@ -7616,7 +7839,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sl/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sl/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -7624,9 +7847,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sl != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sl[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__sl\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sl == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sl[0][-1] < 0.5)" ], "syntax": "jq" } @@ -7652,7 +7876,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sk/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sk/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -7660,9 +7884,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sk != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sk[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__sk\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sk == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sk[0][-1] < 0.5)" ], "syntax": "jq" } @@ -7688,7 +7913,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sq/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sq/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -7696,9 +7921,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sq != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sq[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__sq\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sq == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sq[0][-1] < 0.5)" ], "syntax": "jq" } @@ -7724,7 +7950,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sr/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sr/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -7732,9 +7958,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sr != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sr[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__sr\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sr == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sr[0][-1] < 0.5)" ], "syntax": "jq" } @@ -7760,7 +7987,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/az/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/az/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -7768,9 +7995,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__az != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__az[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__az\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__az == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__az[0][-1] < 0.5)" ], "syntax": "jq" } @@ -7796,7 +8024,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/he/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/he/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -7804,9 +8032,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__he != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__he[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__he\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__he == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__he[0][-1] < 0.5)" ], "syntax": "jq" } @@ -7832,7 +8061,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/th/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/th/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -7840,9 +8069,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__th != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__th[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__th\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__th == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__th[0][-1] < 0.5)" ], "syntax": "jq" } @@ -7868,7 +8098,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/lt/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/lt/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -7876,9 +8106,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lt != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lt[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__lt\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lt == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lt[0][-1] < 0.5)" ], "syntax": "jq" } @@ -7904,7 +8135,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/kn/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/kn/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -7912,9 +8143,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kn != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kn[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__kn\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kn == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kn[0][-1] < 0.5)" ], "syntax": "jq" } @@ -7940,7 +8172,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/lv/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/lv/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -7948,9 +8180,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lv != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lv[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__lv\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lv == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lv[0][-1] < 0.5)" ], "syntax": "jq" } @@ -7976,7 +8209,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/mk/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/mk/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -7984,9 +8217,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mk != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mk[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__mk\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mk == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mk[0][-1] < 0.5)" ], "syntax": "jq" } @@ -8012,7 +8246,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hy/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hy/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -8020,9 +8254,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hy != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hy[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__hy\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hy == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hy[0][-1] < 0.5)" ], "syntax": "jq" } @@ -8048,7 +8283,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/is/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/is/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -8056,9 +8291,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__is != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__is[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__is\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__is == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__is[0][-1] < 0.5)" ], "syntax": "jq" } @@ -8084,7 +8320,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/kk/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/kk/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -8092,9 +8328,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kk != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kk[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__kk\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kk == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kk[0][-1] < 0.5)" ], "syntax": "jq" } @@ -8120,7 +8357,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/gu/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/gu/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -8128,9 +8365,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gu != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gu[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__gu\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gu == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gu[0][-1] < 0.5)" ], "syntax": "jq" } @@ -8156,7 +8394,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ka/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ka/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -8164,9 +8402,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ka != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ka[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ka\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ka == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ka[0][-1] < 0.5)" ], "syntax": "jq" } @@ -8192,7 +8431,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/nn/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/nn/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -8200,9 +8439,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nn != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nn[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__nn\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nn == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nn[0][-1] < 0.5)" ], "syntax": "jq" } @@ -8228,7 +8468,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ne/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ne/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -8236,9 +8476,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ne != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ne[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ne\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ne == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ne[0][-1] < 0.5)" ], "syntax": "jq" } @@ -8264,7 +8505,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/gl/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/gl/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -8272,9 +8513,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gl != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gl[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__gl\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gl == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gl[0][-1] < 0.5)" ], "syntax": "jq" } @@ -8300,7 +8542,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ckb/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ckb/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -8308,9 +8550,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ckb != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ckb[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ckb\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ckb == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ckb[0][-1] < 0.5)" ], "syntax": "jq" } @@ -8336,7 +8579,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ky/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ky/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -8344,9 +8587,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ky != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ky[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ky\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ky == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ky[0][-1] < 0.5)" ], "syntax": "jq" } @@ -8372,7 +8616,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/tl/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/tl/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -8380,9 +8624,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tl != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tl[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__tl\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tl == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tl[0][-1] < 0.5)" ], "syntax": "jq" } @@ -8408,7 +8653,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ug/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ug/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -8416,9 +8661,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ug != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ug[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ug\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ug == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ug[0][-1] < 0.5)" ], "syntax": "jq" } @@ -8444,7 +8690,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/be/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/be/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -8452,9 +8698,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__be != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__be[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__be\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__be == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__be[0][-1] < 0.5)" ], "syntax": "jq" } @@ -8480,7 +8727,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/eu/2019", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/eu/2019", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -8488,9 +8735,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__eu != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__eu[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__eu\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__eu == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__eu[0][-1] < 0.5)" ], "syntax": "jq" } @@ -8516,7 +8764,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/en/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/en/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -8524,9 +8772,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__en != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__en[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__en\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__en == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__en[0][-1] < 0.5)" ], "syntax": "jq" } @@ -8552,7 +8801,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/es/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/es/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -8560,9 +8809,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__es != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__es[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__es\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__es == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__es[0][-1] < 0.5)" ], "syntax": "jq" } @@ -8588,7 +8838,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ru/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ru/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -8596,9 +8846,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ru != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ru[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ru\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ru == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ru[0][-1] < 0.5)" ], "syntax": "jq" } @@ -8624,7 +8875,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/it/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/it/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -8632,9 +8883,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__it != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__it[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__it\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__it == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__it[0][-1] < 0.5)" ], "syntax": "jq" } @@ -8660,7 +8912,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/de/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/de/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -8668,9 +8920,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__de != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__de[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__de\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__de == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__de[0][-1] < 0.5)" ], "syntax": "jq" } @@ -8696,7 +8949,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fr/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fr/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -8704,9 +8957,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fr != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fr[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__fr\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fr == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fr[0][-1] < 0.5)" ], "syntax": "jq" } @@ -8732,7 +8986,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ar/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ar/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -8740,9 +8994,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ar != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ar[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ar\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ar == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ar[0][-1] < 0.5)" ], "syntax": "jq" } @@ -8768,7 +9023,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hi/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hi/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -8776,9 +9031,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hi != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hi[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__hi\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hi == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hi[0][-1] < 0.5)" ], "syntax": "jq" } @@ -8804,7 +9060,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/pt/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/pt/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -8812,9 +9068,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pt != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pt[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__pt\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pt == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pt[0][-1] < 0.5)" ], "syntax": "jq" } @@ -8840,7 +9097,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/tr/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/tr/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -8848,9 +9105,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tr != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tr[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__tr\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tr == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tr[0][-1] < 0.5)" ], "syntax": "jq" } @@ -8876,7 +9134,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/zh/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/zh/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -8884,9 +9142,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__zh != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__zh[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__zh\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__zh == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__zh[0][-1] < 0.5)" ], "syntax": "jq" } @@ -8912,7 +9171,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/el/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/el/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -8920,9 +9179,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__el != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__el[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__el\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__el == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__el[0][-1] < 0.5)" ], "syntax": "jq" } @@ -8948,7 +9208,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ja/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ja/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -8956,9 +9216,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ja != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ja[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ja\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ja == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ja[0][-1] < 0.5)" ], "syntax": "jq" } @@ -8984,7 +9245,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ro/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ro/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -8992,9 +9253,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ro != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ro[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ro\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ro == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ro[0][-1] < 0.5)" ], "syntax": "jq" } @@ -9020,7 +9282,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/nl/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/nl/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -9028,9 +9290,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nl != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nl[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__nl\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nl == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nl[0][-1] < 0.5)" ], "syntax": "jq" } @@ -9056,7 +9319,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ko/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ko/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -9064,9 +9327,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ko != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ko[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ko\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ko == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ko[0][-1] < 0.5)" ], "syntax": "jq" } @@ -9092,7 +9356,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/id/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/id/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -9100,9 +9364,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__id != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__id[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__id\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__id == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__id[0][-1] < 0.5)" ], "syntax": "jq" } @@ -9128,7 +9393,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/uk/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/uk/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -9136,9 +9401,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__uk != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__uk[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__uk\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__uk == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__uk[0][-1] < 0.5)" ], "syntax": "jq" } @@ -9164,7 +9430,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/vi/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/vi/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -9172,9 +9438,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__vi != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__vi[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__vi\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__vi == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__vi[0][-1] < 0.5)" ], "syntax": "jq" } @@ -9200,7 +9467,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/pl/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/pl/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -9208,9 +9475,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pl != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pl[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__pl\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pl == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pl[0][-1] < 0.5)" ], "syntax": "jq" } @@ -9236,7 +9504,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fa/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fa/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -9244,9 +9512,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fa != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fa[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__fa\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fa == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fa[0][-1] < 0.5)" ], "syntax": "jq" } @@ -9272,7 +9541,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sv/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sv/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -9280,9 +9549,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sv != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sv[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__sv\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sv == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sv[0][-1] < 0.5)" ], "syntax": "jq" } @@ -9308,7 +9578,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ta/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ta/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -9316,9 +9586,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ta != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ta[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ta\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ta == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ta[0][-1] < 0.5)" ], "syntax": "jq" } @@ -9344,7 +9615,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/bg/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/bg/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -9352,9 +9623,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bg != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bg[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__bg\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bg == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bg[0][-1] < 0.5)" ], "syntax": "jq" } @@ -9380,7 +9652,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hu/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hu/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -9388,9 +9660,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hu != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hu[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__hu\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hu == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hu[0][-1] < 0.5)" ], "syntax": "jq" } @@ -9416,7 +9689,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/cs/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/cs/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -9424,9 +9697,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__cs != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__cs[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__cs\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__cs == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__cs[0][-1] < 0.5)" ], "syntax": "jq" } @@ -9452,7 +9726,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ur/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ur/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -9460,9 +9734,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ur != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ur[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ur\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ur == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ur[0][-1] < 0.5)" ], "syntax": "jq" } @@ -9488,7 +9763,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/no/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/no/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -9496,9 +9771,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__no != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__no[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__no\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__no == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__no[0][-1] < 0.5)" ], "syntax": "jq" } @@ -9524,7 +9800,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fi/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fi/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -9532,9 +9808,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fi != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fi[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__fi\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fi == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fi[0][-1] < 0.5)" ], "syntax": "jq" } @@ -9560,7 +9837,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/bn/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/bn/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -9568,9 +9845,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bn != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bn[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__bn\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bn == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bn[0][-1] < 0.5)" ], "syntax": "jq" } @@ -9596,7 +9874,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/et/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/et/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -9604,9 +9882,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__et != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__et[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__et\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__et == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__et[0][-1] < 0.5)" ], "syntax": "jq" } @@ -9632,7 +9911,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/mr/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/mr/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -9640,9 +9919,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mr != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mr[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__mr\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mr == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mr[0][-1] < 0.5)" ], "syntax": "jq" } @@ -9668,7 +9948,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hr/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hr/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -9676,9 +9956,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hr != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hr[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__hr\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hr == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hr[0][-1] < 0.5)" ], "syntax": "jq" } @@ -9704,7 +9985,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ml/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ml/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -9712,9 +9993,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ml != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ml[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ml\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ml == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ml[0][-1] < 0.5)" ], "syntax": "jq" } @@ -9740,7 +10022,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ca/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ca/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -9748,9 +10030,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ca != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ca[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ca\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ca == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ca[0][-1] < 0.5)" ], "syntax": "jq" } @@ -9776,7 +10059,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/te/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/te/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -9784,9 +10067,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__te != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__te[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__te\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__te == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__te[0][-1] < 0.5)" ], "syntax": "jq" } @@ -9812,7 +10096,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/da/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/da/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -9820,9 +10104,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__da != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__da[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__da\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__da == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__da[0][-1] < 0.5)" ], "syntax": "jq" } @@ -9848,7 +10133,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sl/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sl/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -9856,9 +10141,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sl != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sl[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__sl\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sl == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sl[0][-1] < 0.5)" ], "syntax": "jq" } @@ -9884,7 +10170,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sk/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sk/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -9892,9 +10178,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sk != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sk[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__sk\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sk == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sk[0][-1] < 0.5)" ], "syntax": "jq" } @@ -9920,7 +10207,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sq/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sq/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -9928,9 +10215,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sq != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sq[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__sq\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sq == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sq[0][-1] < 0.5)" ], "syntax": "jq" } @@ -9956,7 +10244,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sr/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sr/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -9964,9 +10252,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sr != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sr[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__sr\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sr == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sr[0][-1] < 0.5)" ], "syntax": "jq" } @@ -9992,7 +10281,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/az/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/az/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -10000,9 +10289,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__az != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__az[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__az\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__az == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__az[0][-1] < 0.5)" ], "syntax": "jq" } @@ -10028,7 +10318,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/he/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/he/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -10036,9 +10326,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__he != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__he[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__he\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__he == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__he[0][-1] < 0.5)" ], "syntax": "jq" } @@ -10064,7 +10355,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/th/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/th/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -10072,9 +10363,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__th != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__th[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__th\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__th == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__th[0][-1] < 0.5)" ], "syntax": "jq" } @@ -10100,7 +10392,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/lt/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/lt/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -10108,9 +10400,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lt != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lt[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__lt\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lt == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lt[0][-1] < 0.5)" ], "syntax": "jq" } @@ -10136,7 +10429,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/kn/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/kn/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -10144,9 +10437,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kn != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kn[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__kn\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kn == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kn[0][-1] < 0.5)" ], "syntax": "jq" } @@ -10172,7 +10466,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/lv/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/lv/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -10180,9 +10474,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lv != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lv[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__lv\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lv == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lv[0][-1] < 0.5)" ], "syntax": "jq" } @@ -10208,7 +10503,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/mk/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/mk/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -10216,9 +10511,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mk != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mk[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__mk\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mk == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mk[0][-1] < 0.5)" ], "syntax": "jq" } @@ -10244,7 +10540,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hy/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hy/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -10252,9 +10548,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hy != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hy[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__hy\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hy == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hy[0][-1] < 0.5)" ], "syntax": "jq" } @@ -10280,7 +10577,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/is/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/is/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -10288,9 +10585,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__is != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__is[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__is\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__is == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__is[0][-1] < 0.5)" ], "syntax": "jq" } @@ -10316,7 +10614,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/kk/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/kk/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -10324,9 +10622,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kk != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kk[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__kk\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kk == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kk[0][-1] < 0.5)" ], "syntax": "jq" } @@ -10352,7 +10651,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/gu/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/gu/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -10360,9 +10659,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gu != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gu[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__gu\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gu == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gu[0][-1] < 0.5)" ], "syntax": "jq" } @@ -10388,7 +10688,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ka/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ka/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -10396,9 +10696,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ka != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ka[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ka\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ka == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ka[0][-1] < 0.5)" ], "syntax": "jq" } @@ -10424,7 +10725,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/nn/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/nn/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -10432,9 +10733,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nn != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nn[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__nn\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nn == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nn[0][-1] < 0.5)" ], "syntax": "jq" } @@ -10460,7 +10762,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ne/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ne/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -10468,9 +10770,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ne != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ne[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ne\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ne == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ne[0][-1] < 0.5)" ], "syntax": "jq" } @@ -10496,7 +10799,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/gl/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/gl/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -10504,9 +10807,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gl != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gl[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__gl\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gl == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gl[0][-1] < 0.5)" ], "syntax": "jq" } @@ -10532,7 +10836,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ckb/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ckb/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -10540,9 +10844,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ckb != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ckb[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ckb\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ckb == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ckb[0][-1] < 0.5)" ], "syntax": "jq" } @@ -10568,7 +10873,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ky/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ky/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -10576,9 +10881,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ky != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ky[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ky\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ky == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ky[0][-1] < 0.5)" ], "syntax": "jq" } @@ -10604,7 +10910,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/tl/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/tl/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -10612,9 +10918,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tl != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tl[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__tl\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tl == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tl[0][-1] < 0.5)" ], "syntax": "jq" } @@ -10640,7 +10947,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ug/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ug/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -10648,9 +10955,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ug != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ug[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ug\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ug == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ug[0][-1] < 0.5)" ], "syntax": "jq" } @@ -10676,7 +10984,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/be/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/be/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -10684,9 +10992,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__be != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__be[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__be\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__be == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__be[0][-1] < 0.5)" ], "syntax": "jq" } @@ -10712,7 +11021,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/eu/2020", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/eu/2020", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -10720,9 +11029,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__eu != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__eu[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__eu\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__eu == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__eu[0][-1] < 0.5)" ], "syntax": "jq" } @@ -10748,7 +11058,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/en/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/en/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -10756,9 +11066,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__en != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__en[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__en\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__en == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__en[0][-1] < 0.5)" ], "syntax": "jq" } @@ -10784,7 +11095,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/es/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/es/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -10792,9 +11103,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__es != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__es[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__es\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__es == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__es[0][-1] < 0.5)" ], "syntax": "jq" } @@ -10820,7 +11132,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ru/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ru/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -10828,9 +11140,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ru != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ru[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ru\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ru == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ru[0][-1] < 0.5)" ], "syntax": "jq" } @@ -10856,7 +11169,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/it/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/it/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -10864,9 +11177,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__it != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__it[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__it\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__it == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__it[0][-1] < 0.5)" ], "syntax": "jq" } @@ -10892,7 +11206,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/de/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/de/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -10900,9 +11214,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__de != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__de[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__de\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__de == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__de[0][-1] < 0.5)" ], "syntax": "jq" } @@ -10928,7 +11243,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fr/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fr/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -10936,9 +11251,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fr != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fr[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__fr\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fr == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fr[0][-1] < 0.5)" ], "syntax": "jq" } @@ -10964,7 +11280,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ar/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ar/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -10972,9 +11288,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ar != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ar[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ar\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ar == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ar[0][-1] < 0.5)" ], "syntax": "jq" } @@ -11000,7 +11317,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hi/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hi/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -11008,9 +11325,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hi != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hi[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__hi\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hi == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hi[0][-1] < 0.5)" ], "syntax": "jq" } @@ -11036,7 +11354,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/pt/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/pt/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -11044,9 +11362,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pt != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pt[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__pt\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pt == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pt[0][-1] < 0.5)" ], "syntax": "jq" } @@ -11072,7 +11391,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/tr/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/tr/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -11080,9 +11399,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tr != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tr[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__tr\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tr == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tr[0][-1] < 0.5)" ], "syntax": "jq" } @@ -11108,7 +11428,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/zh/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/zh/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -11116,9 +11436,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__zh != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__zh[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__zh\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__zh == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__zh[0][-1] < 0.5)" ], "syntax": "jq" } @@ -11144,7 +11465,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/el/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/el/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -11152,9 +11473,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__el != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__el[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__el\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__el == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__el[0][-1] < 0.5)" ], "syntax": "jq" } @@ -11180,7 +11502,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ja/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ja/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -11188,9 +11510,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ja != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ja[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ja\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ja == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ja[0][-1] < 0.5)" ], "syntax": "jq" } @@ -11216,7 +11539,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ro/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ro/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -11224,9 +11547,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ro != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ro[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ro\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ro == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ro[0][-1] < 0.5)" ], "syntax": "jq" } @@ -11252,7 +11576,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/nl/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/nl/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -11260,9 +11584,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nl != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nl[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__nl\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nl == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nl[0][-1] < 0.5)" ], "syntax": "jq" } @@ -11288,7 +11613,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ko/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ko/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -11296,9 +11621,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ko != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ko[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ko\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ko == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ko[0][-1] < 0.5)" ], "syntax": "jq" } @@ -11324,7 +11650,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/id/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/id/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -11332,9 +11658,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__id != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__id[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__id\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__id == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__id[0][-1] < 0.5)" ], "syntax": "jq" } @@ -11360,7 +11687,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/uk/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/uk/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -11368,9 +11695,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__uk != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__uk[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__uk\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__uk == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__uk[0][-1] < 0.5)" ], "syntax": "jq" } @@ -11396,7 +11724,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/vi/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/vi/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -11404,9 +11732,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__vi != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__vi[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__vi\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__vi == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__vi[0][-1] < 0.5)" ], "syntax": "jq" } @@ -11432,7 +11761,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/pl/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/pl/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -11440,9 +11769,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pl != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pl[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__pl\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pl == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pl[0][-1] < 0.5)" ], "syntax": "jq" } @@ -11468,7 +11798,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fa/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fa/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -11476,9 +11806,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fa != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fa[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__fa\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fa == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fa[0][-1] < 0.5)" ], "syntax": "jq" } @@ -11504,7 +11835,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sv/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sv/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -11512,9 +11843,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sv != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sv[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__sv\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sv == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sv[0][-1] < 0.5)" ], "syntax": "jq" } @@ -11540,7 +11872,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ta/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ta/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -11548,9 +11880,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ta != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ta[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ta\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ta == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ta[0][-1] < 0.5)" ], "syntax": "jq" } @@ -11576,7 +11909,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/bg/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/bg/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -11584,9 +11917,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bg != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bg[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__bg\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bg == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bg[0][-1] < 0.5)" ], "syntax": "jq" } @@ -11612,7 +11946,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hu/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hu/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -11620,9 +11954,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hu != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hu[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__hu\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hu == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hu[0][-1] < 0.5)" ], "syntax": "jq" } @@ -11648,7 +11983,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/cs/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/cs/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -11656,9 +11991,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__cs != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__cs[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__cs\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__cs == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__cs[0][-1] < 0.5)" ], "syntax": "jq" } @@ -11684,7 +12020,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ur/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ur/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -11692,9 +12028,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ur != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ur[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ur\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ur == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ur[0][-1] < 0.5)" ], "syntax": "jq" } @@ -11720,7 +12057,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/no/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/no/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -11728,9 +12065,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__no != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__no[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__no\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__no == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__no[0][-1] < 0.5)" ], "syntax": "jq" } @@ -11756,7 +12094,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fi/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fi/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -11764,9 +12102,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fi != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fi[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__fi\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fi == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fi[0][-1] < 0.5)" ], "syntax": "jq" } @@ -11792,7 +12131,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/bn/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/bn/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -11800,9 +12139,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bn != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bn[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__bn\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bn == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bn[0][-1] < 0.5)" ], "syntax": "jq" } @@ -11828,7 +12168,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/et/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/et/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -11836,9 +12176,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__et != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__et[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__et\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__et == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__et[0][-1] < 0.5)" ], "syntax": "jq" } @@ -11864,7 +12205,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/mr/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/mr/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -11872,9 +12213,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mr != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mr[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__mr\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mr == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mr[0][-1] < 0.5)" ], "syntax": "jq" } @@ -11900,7 +12242,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hr/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hr/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -11908,9 +12250,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hr != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hr[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__hr\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hr == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hr[0][-1] < 0.5)" ], "syntax": "jq" } @@ -11936,7 +12279,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ml/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ml/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -11944,9 +12287,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ml != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ml[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ml\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ml == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ml[0][-1] < 0.5)" ], "syntax": "jq" } @@ -11972,7 +12316,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ca/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ca/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -11980,9 +12324,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ca != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ca[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ca\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ca == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ca[0][-1] < 0.5)" ], "syntax": "jq" } @@ -12008,7 +12353,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/te/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/te/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -12016,9 +12361,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__te != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__te[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__te\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__te == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__te[0][-1] < 0.5)" ], "syntax": "jq" } @@ -12044,7 +12390,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/da/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/da/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -12052,9 +12398,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__da != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__da[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__da\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__da == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__da[0][-1] < 0.5)" ], "syntax": "jq" } @@ -12080,7 +12427,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sl/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sl/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -12088,9 +12435,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sl != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sl[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__sl\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sl == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sl[0][-1] < 0.5)" ], "syntax": "jq" } @@ -12116,7 +12464,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sk/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sk/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -12124,9 +12472,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sk != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sk[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__sk\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sk == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sk[0][-1] < 0.5)" ], "syntax": "jq" } @@ -12152,7 +12501,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sq/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sq/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -12160,9 +12509,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sq != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sq[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__sq\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sq == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sq[0][-1] < 0.5)" ], "syntax": "jq" } @@ -12188,7 +12538,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sr/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sr/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -12196,9 +12546,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sr != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sr[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__sr\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sr == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sr[0][-1] < 0.5)" ], "syntax": "jq" } @@ -12224,7 +12575,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/az/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/az/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -12232,9 +12583,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__az != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__az[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__az\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__az == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__az[0][-1] < 0.5)" ], "syntax": "jq" } @@ -12260,7 +12612,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/he/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/he/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -12268,9 +12620,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__he != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__he[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__he\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__he == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__he[0][-1] < 0.5)" ], "syntax": "jq" } @@ -12296,7 +12649,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/th/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/th/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -12304,9 +12657,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__th != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__th[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__th\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__th == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__th[0][-1] < 0.5)" ], "syntax": "jq" } @@ -12332,7 +12686,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/lt/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/lt/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -12340,9 +12694,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lt != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lt[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__lt\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lt == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lt[0][-1] < 0.5)" ], "syntax": "jq" } @@ -12368,7 +12723,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/kn/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/kn/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -12376,9 +12731,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kn != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kn[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__kn\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kn == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kn[0][-1] < 0.5)" ], "syntax": "jq" } @@ -12404,7 +12760,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/lv/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/lv/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -12412,9 +12768,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lv != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lv[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__lv\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lv == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lv[0][-1] < 0.5)" ], "syntax": "jq" } @@ -12440,7 +12797,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/mk/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/mk/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -12448,9 +12805,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mk != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mk[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__mk\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mk == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mk[0][-1] < 0.5)" ], "syntax": "jq" } @@ -12476,7 +12834,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hy/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hy/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -12484,9 +12842,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hy != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hy[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__hy\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hy == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hy[0][-1] < 0.5)" ], "syntax": "jq" } @@ -12512,7 +12871,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/is/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/is/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -12520,9 +12879,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__is != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__is[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__is\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__is == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__is[0][-1] < 0.5)" ], "syntax": "jq" } @@ -12548,7 +12908,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/kk/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/kk/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -12556,9 +12916,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kk != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kk[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__kk\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kk == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kk[0][-1] < 0.5)" ], "syntax": "jq" } @@ -12584,7 +12945,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/gu/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/gu/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -12592,9 +12953,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gu != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gu[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__gu\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gu == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gu[0][-1] < 0.5)" ], "syntax": "jq" } @@ -12620,7 +12982,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ka/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ka/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -12628,9 +12990,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ka != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ka[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ka\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ka == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ka[0][-1] < 0.5)" ], "syntax": "jq" } @@ -12656,7 +13019,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/nn/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/nn/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -12664,9 +13027,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nn != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nn[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__nn\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nn == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nn[0][-1] < 0.5)" ], "syntax": "jq" } @@ -12692,7 +13056,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ne/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ne/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -12700,9 +13064,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ne != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ne[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ne\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ne == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ne[0][-1] < 0.5)" ], "syntax": "jq" } @@ -12728,7 +13093,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/gl/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/gl/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -12736,9 +13101,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gl != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gl[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__gl\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gl == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gl[0][-1] < 0.5)" ], "syntax": "jq" } @@ -12764,7 +13130,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ckb/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ckb/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -12772,9 +13138,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ckb != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ckb[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ckb\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ckb == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ckb[0][-1] < 0.5)" ], "syntax": "jq" } @@ -12800,7 +13167,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ky/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ky/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -12808,9 +13175,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ky != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ky[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ky\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ky == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ky[0][-1] < 0.5)" ], "syntax": "jq" } @@ -12836,7 +13204,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/tl/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/tl/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -12844,9 +13212,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tl != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tl[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__tl\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tl == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tl[0][-1] < 0.5)" ], "syntax": "jq" } @@ -12872,7 +13241,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ug/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ug/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -12880,9 +13249,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ug != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ug[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ug\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ug == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ug[0][-1] < 0.5)" ], "syntax": "jq" } @@ -12908,7 +13278,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/be/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/be/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -12916,9 +13286,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__be != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__be[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__be\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__be == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__be[0][-1] < 0.5)" ], "syntax": "jq" } @@ -12944,7 +13315,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/eu/2021", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/eu/2021", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -12952,9 +13323,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__eu != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__eu[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__eu\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__eu == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__eu[0][-1] < 0.5)" ], "syntax": "jq" } @@ -12980,7 +13352,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/en/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/en/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -12988,9 +13360,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__en != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__en[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__en\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__en == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__en[0][-1] < 0.5)" ], "syntax": "jq" } @@ -13016,7 +13389,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/es/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/es/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -13024,9 +13397,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__es != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__es[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__es\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__es == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__es[0][-1] < 0.5)" ], "syntax": "jq" } @@ -13052,7 +13426,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ru/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ru/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -13060,9 +13434,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ru != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ru[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ru\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ru == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ru[0][-1] < 0.5)" ], "syntax": "jq" } @@ -13088,7 +13463,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/it/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/it/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -13096,9 +13471,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__it != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__it[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__it\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__it == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__it[0][-1] < 0.5)" ], "syntax": "jq" } @@ -13124,7 +13500,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/de/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/de/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -13132,9 +13508,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__de != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__de[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__de\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__de == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__de[0][-1] < 0.5)" ], "syntax": "jq" } @@ -13160,7 +13537,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fr/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fr/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -13168,9 +13545,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fr != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fr[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__fr\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fr == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fr[0][-1] < 0.5)" ], "syntax": "jq" } @@ -13196,7 +13574,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ar/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ar/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -13204,9 +13582,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ar != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ar[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ar\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ar == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ar[0][-1] < 0.5)" ], "syntax": "jq" } @@ -13232,7 +13611,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hi/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hi/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -13240,9 +13619,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hi != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hi[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__hi\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hi == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hi[0][-1] < 0.5)" ], "syntax": "jq" } @@ -13268,7 +13648,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/pt/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/pt/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -13276,9 +13656,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pt != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pt[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__pt\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pt == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pt[0][-1] < 0.5)" ], "syntax": "jq" } @@ -13304,7 +13685,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/tr/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/tr/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -13312,9 +13693,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tr != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tr[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__tr\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tr == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tr[0][-1] < 0.5)" ], "syntax": "jq" } @@ -13340,7 +13722,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/zh/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/zh/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -13348,9 +13730,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__zh != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__zh[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__zh\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__zh == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__zh[0][-1] < 0.5)" ], "syntax": "jq" } @@ -13376,7 +13759,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/el/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/el/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -13384,9 +13767,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__el != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__el[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__el\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__el == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__el[0][-1] < 0.5)" ], "syntax": "jq" } @@ -13412,7 +13796,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ja/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ja/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -13420,9 +13804,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ja != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ja[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ja\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ja == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ja[0][-1] < 0.5)" ], "syntax": "jq" } @@ -13448,7 +13833,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ro/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ro/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -13456,9 +13841,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ro != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ro[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ro\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ro == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ro[0][-1] < 0.5)" ], "syntax": "jq" } @@ -13484,7 +13870,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/nl/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/nl/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -13492,9 +13878,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nl != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nl[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__nl\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nl == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nl[0][-1] < 0.5)" ], "syntax": "jq" } @@ -13520,7 +13907,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ko/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ko/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -13528,9 +13915,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ko != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ko[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ko\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ko == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ko[0][-1] < 0.5)" ], "syntax": "jq" } @@ -13556,7 +13944,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/id/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/id/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -13564,9 +13952,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__id != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__id[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__id\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__id == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__id[0][-1] < 0.5)" ], "syntax": "jq" } @@ -13592,7 +13981,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/uk/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/uk/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -13600,9 +13989,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__uk != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__uk[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__uk\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__uk == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__uk[0][-1] < 0.5)" ], "syntax": "jq" } @@ -13628,7 +14018,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/vi/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/vi/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -13636,9 +14026,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__vi != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__vi[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__vi\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__vi == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__vi[0][-1] < 0.5)" ], "syntax": "jq" } @@ -13664,7 +14055,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/pl/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/pl/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -13672,9 +14063,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pl != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pl[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__pl\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pl == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pl[0][-1] < 0.5)" ], "syntax": "jq" } @@ -13700,7 +14092,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fa/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fa/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -13708,9 +14100,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fa != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fa[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__fa\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fa == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fa[0][-1] < 0.5)" ], "syntax": "jq" } @@ -13736,7 +14129,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sv/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sv/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -13744,9 +14137,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sv != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sv[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__sv\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sv == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sv[0][-1] < 0.5)" ], "syntax": "jq" } @@ -13772,7 +14166,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ta/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ta/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -13780,9 +14174,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ta != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ta[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ta\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ta == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ta[0][-1] < 0.5)" ], "syntax": "jq" } @@ -13808,7 +14203,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/bg/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/bg/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -13816,9 +14211,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bg != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bg[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__bg\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bg == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bg[0][-1] < 0.5)" ], "syntax": "jq" } @@ -13844,7 +14240,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hu/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hu/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -13852,9 +14248,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hu != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hu[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__hu\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hu == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hu[0][-1] < 0.5)" ], "syntax": "jq" } @@ -13880,7 +14277,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/cs/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/cs/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -13888,9 +14285,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__cs != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__cs[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__cs\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__cs == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__cs[0][-1] < 0.5)" ], "syntax": "jq" } @@ -13916,7 +14314,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ur/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ur/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -13924,9 +14322,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ur != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ur[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ur\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ur == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ur[0][-1] < 0.5)" ], "syntax": "jq" } @@ -13952,7 +14351,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/no/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/no/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -13960,9 +14359,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__no != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__no[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__no\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__no == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__no[0][-1] < 0.5)" ], "syntax": "jq" } @@ -13988,7 +14388,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fi/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fi/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -13996,9 +14396,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fi != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fi[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__fi\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fi == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fi[0][-1] < 0.5)" ], "syntax": "jq" } @@ -14024,7 +14425,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/bn/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/bn/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -14032,9 +14433,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bn != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bn[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__bn\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bn == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bn[0][-1] < 0.5)" ], "syntax": "jq" } @@ -14060,7 +14462,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/et/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/et/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -14068,9 +14470,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__et != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__et[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__et\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__et == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__et[0][-1] < 0.5)" ], "syntax": "jq" } @@ -14096,7 +14499,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/mr/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/mr/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -14104,9 +14507,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mr != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mr[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__mr\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mr == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mr[0][-1] < 0.5)" ], "syntax": "jq" } @@ -14132,7 +14536,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hr/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hr/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -14140,9 +14544,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hr != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hr[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__hr\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hr == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hr[0][-1] < 0.5)" ], "syntax": "jq" } @@ -14168,7 +14573,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ml/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ml/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -14176,9 +14581,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ml != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ml[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ml\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ml == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ml[0][-1] < 0.5)" ], "syntax": "jq" } @@ -14204,7 +14610,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ca/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ca/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -14212,9 +14618,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ca != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ca[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ca\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ca == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ca[0][-1] < 0.5)" ], "syntax": "jq" } @@ -14240,7 +14647,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/te/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/te/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -14248,9 +14655,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__te != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__te[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__te\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__te == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__te[0][-1] < 0.5)" ], "syntax": "jq" } @@ -14276,7 +14684,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/da/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/da/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -14284,9 +14692,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__da != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__da[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__da\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__da == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__da[0][-1] < 0.5)" ], "syntax": "jq" } @@ -14312,7 +14721,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sl/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sl/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -14320,9 +14729,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sl != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sl[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__sl\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sl == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sl[0][-1] < 0.5)" ], "syntax": "jq" } @@ -14348,7 +14758,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sk/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sk/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -14356,9 +14766,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sk != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sk[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__sk\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sk == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sk[0][-1] < 0.5)" ], "syntax": "jq" } @@ -14384,7 +14795,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sq/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sq/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -14392,9 +14803,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sq != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sq[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__sq\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sq == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sq[0][-1] < 0.5)" ], "syntax": "jq" } @@ -14420,7 +14832,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sr/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sr/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -14428,9 +14840,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sr != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sr[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__sr\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sr == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sr[0][-1] < 0.5)" ], "syntax": "jq" } @@ -14456,7 +14869,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/az/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/az/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -14464,9 +14877,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__az != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__az[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__az\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__az == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__az[0][-1] < 0.5)" ], "syntax": "jq" } @@ -14492,7 +14906,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/he/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/he/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -14500,9 +14914,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__he != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__he[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__he\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__he == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__he[0][-1] < 0.5)" ], "syntax": "jq" } @@ -14528,7 +14943,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/th/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/th/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -14536,9 +14951,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__th != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__th[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__th\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__th == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__th[0][-1] < 0.5)" ], "syntax": "jq" } @@ -14564,7 +14980,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/lt/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/lt/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -14572,9 +14988,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lt != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lt[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__lt\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lt == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lt[0][-1] < 0.5)" ], "syntax": "jq" } @@ -14600,7 +15017,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/kn/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/kn/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -14608,9 +15025,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kn != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kn[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__kn\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kn == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kn[0][-1] < 0.5)" ], "syntax": "jq" } @@ -14636,7 +15054,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/lv/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/lv/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -14644,9 +15062,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lv != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lv[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__lv\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lv == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lv[0][-1] < 0.5)" ], "syntax": "jq" } @@ -14672,7 +15091,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/mk/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/mk/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -14680,9 +15099,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mk != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mk[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__mk\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mk == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mk[0][-1] < 0.5)" ], "syntax": "jq" } @@ -14708,7 +15128,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hy/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hy/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -14716,9 +15136,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hy != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hy[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__hy\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hy == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hy[0][-1] < 0.5)" ], "syntax": "jq" } @@ -14744,7 +15165,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/is/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/is/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -14752,9 +15173,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__is != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__is[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__is\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__is == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__is[0][-1] < 0.5)" ], "syntax": "jq" } @@ -14780,7 +15202,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/kk/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/kk/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -14788,9 +15210,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kk != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kk[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__kk\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kk == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kk[0][-1] < 0.5)" ], "syntax": "jq" } @@ -14816,7 +15239,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/gu/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/gu/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -14824,9 +15247,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gu != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gu[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__gu\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gu == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gu[0][-1] < 0.5)" ], "syntax": "jq" } @@ -14852,7 +15276,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ka/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ka/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -14860,9 +15284,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ka != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ka[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ka\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ka == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ka[0][-1] < 0.5)" ], "syntax": "jq" } @@ -14888,7 +15313,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/nn/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/nn/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -14896,9 +15321,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nn != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nn[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__nn\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nn == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nn[0][-1] < 0.5)" ], "syntax": "jq" } @@ -14924,7 +15350,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ne/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ne/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -14932,9 +15358,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ne != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ne[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ne\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ne == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ne[0][-1] < 0.5)" ], "syntax": "jq" } @@ -14960,7 +15387,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/gl/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/gl/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -14968,9 +15395,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gl != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gl[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__gl\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gl == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gl[0][-1] < 0.5)" ], "syntax": "jq" } @@ -14996,7 +15424,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ckb/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ckb/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -15004,9 +15432,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ckb != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ckb[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ckb\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ckb == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ckb[0][-1] < 0.5)" ], "syntax": "jq" } @@ -15032,7 +15461,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ky/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ky/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -15040,9 +15469,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ky != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ky[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ky\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ky == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ky[0][-1] < 0.5)" ], "syntax": "jq" } @@ -15068,7 +15498,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/tl/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/tl/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -15076,9 +15506,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tl != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tl[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__tl\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tl == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tl[0][-1] < 0.5)" ], "syntax": "jq" } @@ -15104,7 +15535,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ug/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ug/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -15112,9 +15543,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ug != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ug[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ug\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ug == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ug[0][-1] < 0.5)" ], "syntax": "jq" } @@ -15140,7 +15572,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/be/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/be/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -15148,9 +15580,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__be != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__be[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__be\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__be == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__be[0][-1] < 0.5)" ], "syntax": "jq" } @@ -15176,7 +15609,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/eu/2022", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/eu/2022", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -15184,9 +15617,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__eu != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__eu[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__eu\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__eu == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__eu[0][-1] < 0.5)" ], "syntax": "jq" } @@ -15212,7 +15646,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/en/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/en/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -15220,9 +15654,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__en != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__en[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__en\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__en == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__en[0][-1] < 0.5)" ], "syntax": "jq" } @@ -15248,7 +15683,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/es/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/es/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -15256,9 +15691,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__es != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__es[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__es\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__es == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__es[0][-1] < 0.5)" ], "syntax": "jq" } @@ -15284,7 +15720,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ru/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ru/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -15292,9 +15728,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ru != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ru[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ru\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ru == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ru[0][-1] < 0.5)" ], "syntax": "jq" } @@ -15320,7 +15757,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/it/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/it/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -15328,9 +15765,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__it != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__it[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__it\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__it == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__it[0][-1] < 0.5)" ], "syntax": "jq" } @@ -15356,7 +15794,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/de/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/de/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -15364,9 +15802,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__de != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__de[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__de\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__de == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__de[0][-1] < 0.5)" ], "syntax": "jq" } @@ -15392,7 +15831,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fr/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fr/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -15400,9 +15839,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fr != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fr[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__fr\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fr == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fr[0][-1] < 0.5)" ], "syntax": "jq" } @@ -15428,7 +15868,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ar/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ar/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -15436,9 +15876,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ar != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ar[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ar\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ar == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ar[0][-1] < 0.5)" ], "syntax": "jq" } @@ -15464,7 +15905,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hi/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hi/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -15472,9 +15913,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hi != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hi[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__hi\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hi == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hi[0][-1] < 0.5)" ], "syntax": "jq" } @@ -15500,7 +15942,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/pt/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/pt/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -15508,9 +15950,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pt != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pt[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__pt\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pt == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pt[0][-1] < 0.5)" ], "syntax": "jq" } @@ -15536,7 +15979,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/tr/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/tr/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -15544,9 +15987,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tr != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tr[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__tr\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tr == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tr[0][-1] < 0.5)" ], "syntax": "jq" } @@ -15572,7 +16016,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/zh/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/zh/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -15580,9 +16024,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__zh != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__zh[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__zh\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__zh == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__zh[0][-1] < 0.5)" ], "syntax": "jq" } @@ -15608,7 +16053,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/el/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/el/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -15616,9 +16061,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__el != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__el[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__el\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__el == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__el[0][-1] < 0.5)" ], "syntax": "jq" } @@ -15644,7 +16090,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ja/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ja/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -15652,9 +16098,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ja != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ja[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ja\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ja == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ja[0][-1] < 0.5)" ], "syntax": "jq" } @@ -15680,7 +16127,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ro/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ro/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -15688,9 +16135,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ro != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ro[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ro\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ro == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ro[0][-1] < 0.5)" ], "syntax": "jq" } @@ -15716,7 +16164,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/nl/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/nl/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -15724,9 +16172,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nl != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nl[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__nl\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nl == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nl[0][-1] < 0.5)" ], "syntax": "jq" } @@ -15752,7 +16201,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ko/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ko/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -15760,9 +16209,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ko != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ko[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ko\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ko == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ko[0][-1] < 0.5)" ], "syntax": "jq" } @@ -15788,7 +16238,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/id/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/id/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -15796,9 +16246,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__id != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__id[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__id\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__id == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__id[0][-1] < 0.5)" ], "syntax": "jq" } @@ -15824,7 +16275,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/uk/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/uk/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -15832,9 +16283,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__uk != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__uk[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__uk\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__uk == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__uk[0][-1] < 0.5)" ], "syntax": "jq" } @@ -15860,7 +16312,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/vi/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/vi/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -15868,9 +16320,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__vi != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__vi[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__vi\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__vi == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__vi[0][-1] < 0.5)" ], "syntax": "jq" } @@ -15896,7 +16349,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/pl/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/pl/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -15904,9 +16357,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pl != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pl[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__pl\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pl == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pl[0][-1] < 0.5)" ], "syntax": "jq" } @@ -15932,7 +16386,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fa/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fa/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -15940,9 +16394,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fa != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fa[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__fa\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fa == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fa[0][-1] < 0.5)" ], "syntax": "jq" } @@ -15968,7 +16423,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sv/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sv/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -15976,9 +16431,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sv != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sv[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__sv\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sv == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sv[0][-1] < 0.5)" ], "syntax": "jq" } @@ -16004,7 +16460,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ta/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ta/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -16012,9 +16468,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ta != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ta[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ta\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ta == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ta[0][-1] < 0.5)" ], "syntax": "jq" } @@ -16040,7 +16497,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/bg/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/bg/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -16048,9 +16505,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bg != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bg[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__bg\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bg == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bg[0][-1] < 0.5)" ], "syntax": "jq" } @@ -16076,7 +16534,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hu/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hu/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -16084,9 +16542,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hu != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hu[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__hu\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hu == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hu[0][-1] < 0.5)" ], "syntax": "jq" } @@ -16112,7 +16571,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/cs/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/cs/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -16120,9 +16579,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__cs != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__cs[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__cs\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__cs == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__cs[0][-1] < 0.5)" ], "syntax": "jq" } @@ -16148,7 +16608,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ur/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ur/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -16156,9 +16616,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ur != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ur[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ur\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ur == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ur[0][-1] < 0.5)" ], "syntax": "jq" } @@ -16184,7 +16645,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/no/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/no/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -16192,9 +16653,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__no != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__no[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__no\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__no == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__no[0][-1] < 0.5)" ], "syntax": "jq" } @@ -16220,7 +16682,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fi/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fi/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -16228,9 +16690,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fi != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fi[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__fi\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fi == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fi[0][-1] < 0.5)" ], "syntax": "jq" } @@ -16256,7 +16719,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/bn/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/bn/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -16264,9 +16727,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bn != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bn[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__bn\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bn == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bn[0][-1] < 0.5)" ], "syntax": "jq" } @@ -16292,7 +16756,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/et/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/et/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -16300,9 +16764,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__et != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__et[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__et\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__et == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__et[0][-1] < 0.5)" ], "syntax": "jq" } @@ -16328,7 +16793,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/mr/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/mr/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -16336,9 +16801,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mr != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mr[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__mr\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mr == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mr[0][-1] < 0.5)" ], "syntax": "jq" } @@ -16364,7 +16830,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hr/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hr/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -16372,9 +16838,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hr != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hr[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__hr\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hr == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hr[0][-1] < 0.5)" ], "syntax": "jq" } @@ -16400,7 +16867,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ml/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ml/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -16408,9 +16875,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ml != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ml[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ml\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ml == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ml[0][-1] < 0.5)" ], "syntax": "jq" } @@ -16436,7 +16904,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ca/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ca/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -16444,9 +16912,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ca != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ca[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ca\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ca == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ca[0][-1] < 0.5)" ], "syntax": "jq" } @@ -16472,7 +16941,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/te/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/te/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -16480,9 +16949,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__te != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__te[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__te\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__te == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__te[0][-1] < 0.5)" ], "syntax": "jq" } @@ -16508,7 +16978,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/da/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/da/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -16516,9 +16986,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__da != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__da[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__da\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__da == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__da[0][-1] < 0.5)" ], "syntax": "jq" } @@ -16544,7 +17015,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sl/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sl/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -16552,9 +17023,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sl != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sl[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__sl\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sl == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sl[0][-1] < 0.5)" ], "syntax": "jq" } @@ -16580,7 +17052,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sk/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sk/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -16588,9 +17060,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sk != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sk[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__sk\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sk == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sk[0][-1] < 0.5)" ], "syntax": "jq" } @@ -16616,7 +17089,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sq/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sq/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -16624,9 +17097,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sq != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sq[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__sq\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sq == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sq[0][-1] < 0.5)" ], "syntax": "jq" } @@ -16652,7 +17126,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sr/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sr/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -16660,9 +17134,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sr != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sr[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__sr\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sr == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sr[0][-1] < 0.5)" ], "syntax": "jq" } @@ -16688,7 +17163,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/az/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/az/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -16696,9 +17171,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__az != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__az[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__az\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__az == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__az[0][-1] < 0.5)" ], "syntax": "jq" } @@ -16724,7 +17200,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/he/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/he/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -16732,9 +17208,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__he != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__he[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__he\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__he == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__he[0][-1] < 0.5)" ], "syntax": "jq" } @@ -16760,7 +17237,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/th/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/th/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -16768,9 +17245,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__th != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__th[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__th\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__th == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__th[0][-1] < 0.5)" ], "syntax": "jq" } @@ -16796,7 +17274,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/lt/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/lt/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -16804,9 +17282,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lt != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lt[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__lt\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lt == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lt[0][-1] < 0.5)" ], "syntax": "jq" } @@ -16832,7 +17311,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/kn/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/kn/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -16840,9 +17319,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kn != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kn[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__kn\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kn == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kn[0][-1] < 0.5)" ], "syntax": "jq" } @@ -16868,7 +17348,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/lv/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/lv/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -16876,9 +17356,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lv != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lv[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__lv\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lv == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lv[0][-1] < 0.5)" ], "syntax": "jq" } @@ -16904,7 +17385,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/mk/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/mk/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -16912,9 +17393,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mk != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mk[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__mk\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mk == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mk[0][-1] < 0.5)" ], "syntax": "jq" } @@ -16940,7 +17422,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hy/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hy/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -16948,9 +17430,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hy != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hy[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__hy\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hy == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hy[0][-1] < 0.5)" ], "syntax": "jq" } @@ -16976,7 +17459,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/is/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/is/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -16984,9 +17467,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__is != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__is[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__is\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__is == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__is[0][-1] < 0.5)" ], "syntax": "jq" } @@ -17012,7 +17496,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/kk/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/kk/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -17020,9 +17504,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kk != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kk[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__kk\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kk == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kk[0][-1] < 0.5)" ], "syntax": "jq" } @@ -17048,7 +17533,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/gu/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/gu/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -17056,9 +17541,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gu != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gu[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__gu\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gu == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gu[0][-1] < 0.5)" ], "syntax": "jq" } @@ -17084,7 +17570,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ka/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ka/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -17092,9 +17578,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ka != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ka[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ka\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ka == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ka[0][-1] < 0.5)" ], "syntax": "jq" } @@ -17120,7 +17607,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/nn/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/nn/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -17128,9 +17615,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nn != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nn[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__nn\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nn == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nn[0][-1] < 0.5)" ], "syntax": "jq" } @@ -17156,7 +17644,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ne/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ne/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -17164,9 +17652,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ne != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ne[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ne\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ne == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ne[0][-1] < 0.5)" ], "syntax": "jq" } @@ -17192,7 +17681,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/gl/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/gl/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -17200,9 +17689,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gl != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gl[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__gl\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gl == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gl[0][-1] < 0.5)" ], "syntax": "jq" } @@ -17228,7 +17718,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ckb/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ckb/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -17236,9 +17726,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ckb != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ckb[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ckb\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ckb == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ckb[0][-1] < 0.5)" ], "syntax": "jq" } @@ -17264,7 +17755,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ky/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ky/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -17272,9 +17763,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ky != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ky[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ky\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ky == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ky[0][-1] < 0.5)" ], "syntax": "jq" } @@ -17300,7 +17792,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/tl/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/tl/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -17308,9 +17800,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tl != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tl[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__tl\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tl == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tl[0][-1] < 0.5)" ], "syntax": "jq" } @@ -17336,7 +17829,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ug/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ug/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -17344,9 +17837,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ug != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ug[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ug\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ug == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ug[0][-1] < 0.5)" ], "syntax": "jq" } @@ -17372,7 +17866,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/be/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/be/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -17380,9 +17874,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__be != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__be[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__be\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__be == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__be[0][-1] < 0.5)" ], "syntax": "jq" } @@ -17408,7 +17903,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/eu/2023", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/eu/2023", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -17416,9 +17911,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__eu != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__eu[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__eu\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__eu == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__eu[0][-1] < 0.5)" ], "syntax": "jq" } @@ -17439,7 +17935,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/en/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/en/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -17447,9 +17943,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__en != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__en[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__en\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__en == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__en[0][-1] < 0.5)" ], "syntax": "jq" } @@ -17470,7 +17967,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/es/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/es/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -17478,9 +17975,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__es != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__es[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__es\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__es == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__es[0][-1] < 0.5)" ], "syntax": "jq" } @@ -17501,7 +17999,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ru/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ru/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -17509,9 +18007,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ru != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ru[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ru\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ru == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ru[0][-1] < 0.5)" ], "syntax": "jq" } @@ -17532,7 +18031,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/it/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/it/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -17540,9 +18039,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__it != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__it[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__it\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__it == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__it[0][-1] < 0.5)" ], "syntax": "jq" } @@ -17563,7 +18063,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/de/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/de/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -17571,9 +18071,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__de != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__de[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__de\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__de == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__de[0][-1] < 0.5)" ], "syntax": "jq" } @@ -17594,7 +18095,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fr/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fr/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -17602,9 +18103,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fr != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fr[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__fr\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fr == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fr[0][-1] < 0.5)" ], "syntax": "jq" } @@ -17625,7 +18127,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ar/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ar/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -17633,9 +18135,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ar != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ar[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ar\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ar == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ar[0][-1] < 0.5)" ], "syntax": "jq" } @@ -17656,7 +18159,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hi/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hi/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -17664,9 +18167,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hi != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hi[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__hi\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hi == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hi[0][-1] < 0.5)" ], "syntax": "jq" } @@ -17687,7 +18191,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/pt/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/pt/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -17695,9 +18199,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pt != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pt[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__pt\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pt == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pt[0][-1] < 0.5)" ], "syntax": "jq" } @@ -17718,7 +18223,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/tr/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/tr/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -17726,9 +18231,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tr != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tr[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__tr\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tr == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tr[0][-1] < 0.5)" ], "syntax": "jq" } @@ -17749,7 +18255,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/zh/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/zh/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -17757,9 +18263,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__zh != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__zh[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__zh\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__zh == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__zh[0][-1] < 0.5)" ], "syntax": "jq" } @@ -17780,7 +18287,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/el/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/el/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -17788,9 +18295,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__el != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__el[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__el\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__el == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__el[0][-1] < 0.5)" ], "syntax": "jq" } @@ -17811,7 +18319,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ja/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ja/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -17819,9 +18327,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ja != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ja[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ja\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ja == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ja[0][-1] < 0.5)" ], "syntax": "jq" } @@ -17842,7 +18351,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ro/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ro/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -17850,9 +18359,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ro != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ro[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ro\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ro == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ro[0][-1] < 0.5)" ], "syntax": "jq" } @@ -17873,7 +18383,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/nl/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/nl/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -17881,9 +18391,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nl != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nl[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__nl\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nl == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nl[0][-1] < 0.5)" ], "syntax": "jq" } @@ -17904,7 +18415,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ko/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ko/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -17912,9 +18423,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ko != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ko[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ko\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ko == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ko[0][-1] < 0.5)" ], "syntax": "jq" } @@ -17935,7 +18447,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/id/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/id/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -17943,9 +18455,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__id != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__id[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__id\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__id == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__id[0][-1] < 0.5)" ], "syntax": "jq" } @@ -17966,7 +18479,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/uk/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/uk/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -17974,9 +18487,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__uk != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__uk[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__uk\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__uk == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__uk[0][-1] < 0.5)" ], "syntax": "jq" } @@ -17997,7 +18511,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/vi/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/vi/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -18005,9 +18519,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__vi != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__vi[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__vi\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__vi == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__vi[0][-1] < 0.5)" ], "syntax": "jq" } @@ -18028,7 +18543,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/pl/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/pl/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -18036,9 +18551,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pl != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pl[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__pl\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pl == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__pl[0][-1] < 0.5)" ], "syntax": "jq" } @@ -18059,7 +18575,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fa/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fa/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -18067,9 +18583,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fa != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fa[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__fa\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fa == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fa[0][-1] < 0.5)" ], "syntax": "jq" } @@ -18090,7 +18607,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sv/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sv/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -18098,9 +18615,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sv != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sv[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__sv\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sv == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sv[0][-1] < 0.5)" ], "syntax": "jq" } @@ -18121,7 +18639,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ta/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ta/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -18129,9 +18647,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ta != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ta[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ta\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ta == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ta[0][-1] < 0.5)" ], "syntax": "jq" } @@ -18152,7 +18671,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/bg/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/bg/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -18160,9 +18679,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bg != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bg[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__bg\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bg == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bg[0][-1] < 0.5)" ], "syntax": "jq" } @@ -18183,7 +18703,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hu/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hu/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -18191,9 +18711,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hu != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hu[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__hu\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hu == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hu[0][-1] < 0.5)" ], "syntax": "jq" } @@ -18214,7 +18735,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/cs/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/cs/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -18222,9 +18743,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__cs != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__cs[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__cs\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__cs == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__cs[0][-1] < 0.5)" ], "syntax": "jq" } @@ -18245,7 +18767,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ur/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ur/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -18253,9 +18775,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ur != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ur[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ur\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ur == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ur[0][-1] < 0.5)" ], "syntax": "jq" } @@ -18276,7 +18799,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/no/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/no/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -18284,9 +18807,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__no != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__no[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__no\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__no == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__no[0][-1] < 0.5)" ], "syntax": "jq" } @@ -18307,7 +18831,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fi/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/fi/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -18315,9 +18839,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fi != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fi[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__fi\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fi == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__fi[0][-1] < 0.5)" ], "syntax": "jq" } @@ -18338,7 +18863,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/bn/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/bn/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -18346,9 +18871,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bn != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bn[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__bn\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bn == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__bn[0][-1] < 0.5)" ], "syntax": "jq" } @@ -18369,7 +18895,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/et/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/et/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -18377,9 +18903,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__et != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__et[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__et\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__et == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__et[0][-1] < 0.5)" ], "syntax": "jq" } @@ -18400,7 +18927,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/mr/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/mr/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -18408,9 +18935,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mr != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mr[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__mr\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mr == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mr[0][-1] < 0.5)" ], "syntax": "jq" } @@ -18431,7 +18959,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hr/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hr/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -18439,9 +18967,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hr != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hr[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__hr\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hr == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hr[0][-1] < 0.5)" ], "syntax": "jq" } @@ -18462,7 +18991,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ml/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ml/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -18470,9 +18999,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ml != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ml[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ml\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ml == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ml[0][-1] < 0.5)" ], "syntax": "jq" } @@ -18493,7 +19023,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ca/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ca/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -18501,9 +19031,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ca != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ca[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ca\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ca == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ca[0][-1] < 0.5)" ], "syntax": "jq" } @@ -18524,7 +19055,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/te/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/te/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -18532,9 +19063,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__te != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__te[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__te\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__te == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__te[0][-1] < 0.5)" ], "syntax": "jq" } @@ -18555,7 +19087,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/da/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/da/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -18563,9 +19095,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__da != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__da[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__da\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__da == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__da[0][-1] < 0.5)" ], "syntax": "jq" } @@ -18586,7 +19119,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sl/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sl/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -18594,9 +19127,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sl != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sl[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__sl\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sl == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sl[0][-1] < 0.5)" ], "syntax": "jq" } @@ -18617,7 +19151,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sk/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sk/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -18625,9 +19159,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sk != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sk[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__sk\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sk == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sk[0][-1] < 0.5)" ], "syntax": "jq" } @@ -18648,7 +19183,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sq/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sq/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -18656,9 +19191,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sq != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sq[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__sq\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sq == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sq[0][-1] < 0.5)" ], "syntax": "jq" } @@ -18679,7 +19215,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sr/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/sr/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -18687,9 +19223,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sr != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sr[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__sr\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sr == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__sr[0][-1] < 0.5)" ], "syntax": "jq" } @@ -18710,7 +19247,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/az/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/az/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -18718,9 +19255,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__az != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__az[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__az\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__az == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__az[0][-1] < 0.5)" ], "syntax": "jq" } @@ -18741,7 +19279,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/he/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/he/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -18749,9 +19287,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__he != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__he[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__he\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__he == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__he[0][-1] < 0.5)" ], "syntax": "jq" } @@ -18772,7 +19311,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/th/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/th/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -18780,9 +19319,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__th != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__th[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__th\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__th == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__th[0][-1] < 0.5)" ], "syntax": "jq" } @@ -18803,7 +19343,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/lt/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/lt/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -18811,9 +19351,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lt != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lt[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__lt\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lt == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lt[0][-1] < 0.5)" ], "syntax": "jq" } @@ -18834,7 +19375,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/kn/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/kn/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -18842,9 +19383,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kn != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kn[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__kn\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kn == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kn[0][-1] < 0.5)" ], "syntax": "jq" } @@ -18865,7 +19407,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/lv/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/lv/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -18873,9 +19415,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lv != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lv[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__lv\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lv == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__lv[0][-1] < 0.5)" ], "syntax": "jq" } @@ -18896,7 +19439,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/mk/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/mk/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -18904,9 +19447,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mk != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mk[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__mk\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mk == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__mk[0][-1] < 0.5)" ], "syntax": "jq" } @@ -18927,7 +19471,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hy/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/hy/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -18935,9 +19479,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hy != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hy[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__hy\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hy == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__hy[0][-1] < 0.5)" ], "syntax": "jq" } @@ -18958,7 +19503,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/is/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/is/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -18966,9 +19511,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__is != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__is[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__is\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__is == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__is[0][-1] < 0.5)" ], "syntax": "jq" } @@ -18989,7 +19535,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/kk/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/kk/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -18997,9 +19543,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kk != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kk[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__kk\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kk == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__kk[0][-1] < 0.5)" ], "syntax": "jq" } @@ -19020,7 +19567,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/gu/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/gu/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -19028,9 +19575,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gu != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gu[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__gu\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gu == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gu[0][-1] < 0.5)" ], "syntax": "jq" } @@ -19051,7 +19599,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ka/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ka/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -19059,9 +19607,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ka != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ka[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ka\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ka == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ka[0][-1] < 0.5)" ], "syntax": "jq" } @@ -19082,7 +19631,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/nn/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/nn/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -19090,9 +19639,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nn != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nn[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__nn\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nn == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__nn[0][-1] < 0.5)" ], "syntax": "jq" } @@ -19113,7 +19663,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ne/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ne/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -19121,9 +19671,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ne != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ne[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ne\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ne == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ne[0][-1] < 0.5)" ], "syntax": "jq" } @@ -19144,7 +19695,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/gl/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/gl/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -19152,9 +19703,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gl != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gl[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__gl\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gl == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__gl[0][-1] < 0.5)" ], "syntax": "jq" } @@ -19175,7 +19727,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ckb/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ckb/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -19183,9 +19735,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ckb != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ckb[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ckb\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ckb == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ckb[0][-1] < 0.5)" ], "syntax": "jq" } @@ -19206,7 +19759,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ky/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ky/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -19214,9 +19767,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ky != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ky[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ky\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ky == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ky[0][-1] < 0.5)" ], "syntax": "jq" } @@ -19237,7 +19791,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/tl/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/tl/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -19245,9 +19799,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tl != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tl[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__tl\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tl == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__tl[0][-1] < 0.5)" ], "syntax": "jq" } @@ -19268,7 +19823,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ug/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/ug/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -19276,9 +19831,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ug != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ug[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__ug\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ug == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__ug[0][-1] < 0.5)" ], "syntax": "jq" } @@ -19299,7 +19855,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/be/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/be/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -19307,9 +19863,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__be != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__be[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__be\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__be == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__be[0][-1] < 0.5)" ], "syntax": "jq" } @@ -19330,7 +19887,7 @@ "output": "zst" }, "output": { - "path": "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/eu/2024", + "path": "s3://ai2-llm/pretraining-data/sources/cc-news/v2-resiliparse-l100k/documents/eu/2024", "max_size_in_bytes": 10000000000 }, "attributes": [ @@ -19338,9 +19895,10 @@ "dolma_v2_tokenizer" ], "filter": { - "include": [ - ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100", - "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__eu != null) and (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__eu[0][-1] >= 0.5) and ((.attributes | to_entries | map(select(.key | startswith(\"ft_lang_id_1e2__ft_lang_id_1e2__\"))) | max_by(.value) | .key ) == \"ft_lang_id_1e2__ft_lang_id_1e2__eu\")" + "include": [], + "exclude": [ + ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] <= 100", + "(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__eu == null) or (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__eu[0][-1] < 0.5)" ], "syntax": "jq" }