Skip to content

Commit

Permalink
clis
Browse files Browse the repository at this point in the history
  • Loading branch information
soldni committed Oct 29, 2024
1 parent e1d2088 commit 973621f
Show file tree
Hide file tree
Showing 15 changed files with 843 additions and 744 deletions.
45 changes: 45 additions & 0 deletions classifiers/scripts/fineweb_automath_arxiv.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/bin/bash
# Submit a Beaker/gantry job that scores AutoMathText arXiv documents with the
# FineWeb-Edu quality classifier via dolma_classifiers.inference.
set -euo pipefail

# S3 glob of input documents; single-quoted so the local shell never expands it.
DOCUMENTS='s3://ai2-llm/pretraining-data/sources/math-ai_AutoMathText/v0/documents/arxiv/*/*.gz'

NUM_NODES=1
MODEL_NAME="HuggingFaceFW/fineweb-edu-classifier"
CLUSTER="ai2/jupiter*"
BATCH_SIZE=1024
PRIORITY="urgent"

# Derive a short, stable run name from model + input path so identical runs
# map to the same experiment name.
RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}')
RUN_NAME="fineweb_classifier_${RUN_HASH:0:8}"

# Gantry reads the experiment name from this environment variable.
export BEAKER_EXPERIMENT_NAME="${RUN_NAME}"

# Resolve the Beaker user before submitting; under `set -e` a failure here
# aborts instead of silently passing an empty value into the job.
BEAKER_USER_ID=$(beaker account whoami --format json | jq -cr '.[0].name')

# Remote payload: `\${…}` stays escaped so BEAKER_* variables are expanded on
# the worker, not locally; DOCUMENTS is single-quoted remotely so the worker
# shell neither globs nor word-splits the s3 pattern.
gantry run \
    --task-name "${RUN_NAME}" \
    --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \
    --allow-dirty \
    --workspace ai2/davidw-oe-annealing \
    --beaker-image 'petew/olmo-torch23-gantry' \
    --timeout -1 \
    --show-logs \
    --host-networking \
    --venv 'base' \
    --priority "${PRIORITY}" \
    --leader-selection \
    --gpus 8 \
    --replicas "${NUM_NODES}" \
    --preemptible \
    --cluster "${CLUSTER}" \
    --budget ai2/oe-data \
    --env LOG_FILTER_TYPE=local_rank0_only \
    --env OMP_NUM_THREADS=8 \
    --env "BEAKER_USER_ID=${BEAKER_USER_ID}" \
    --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \
    --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \
    --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \
    --shared-memory 10GiB \
    --install "pip install -e classifiers/" \
    --yes \
    -- /bin/bash -c "huggingface-cli download '${MODEL_NAME}' && torchrun --nnodes '${NUM_NODES}:${NUM_NODES}' --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint \"\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400\" --node_rank \"\${BEAKER_REPLICA_RANK}\" --rdzv_conf 'read_timeout=3600' -m dolma_classifiers.inference --source-prefix '${DOCUMENTS}' --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name '${MODEL_NAME}' --num-workers 8 --prefetch-factor 8"
45 changes: 45 additions & 0 deletions classifiers/scripts/fineweb_automath_code.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/bin/bash
# Submit a Beaker/gantry job that scores AutoMathText code documents with the
# FineWeb-Edu quality classifier via dolma_classifiers.inference.
set -euo pipefail

# S3 glob of input documents; single-quoted so the local shell never expands it.
DOCUMENTS='s3://ai2-llm/pretraining-data/sources/math-ai_AutoMathText/v0/documents/code/*/*.gz'

NUM_NODES=1
MODEL_NAME="HuggingFaceFW/fineweb-edu-classifier"
CLUSTER="ai2/jupiter*"
BATCH_SIZE=1024
PRIORITY="urgent"

# Derive a short, stable run name from model + input path so identical runs
# map to the same experiment name.
RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}')
RUN_NAME="fineweb_classifier_${RUN_HASH:0:8}"

# Gantry reads the experiment name from this environment variable.
export BEAKER_EXPERIMENT_NAME="${RUN_NAME}"

# Resolve the Beaker user before submitting; under `set -e` a failure here
# aborts instead of silently passing an empty value into the job.
BEAKER_USER_ID=$(beaker account whoami --format json | jq -cr '.[0].name')

# Remote payload: `\${…}` stays escaped so BEAKER_* variables are expanded on
# the worker, not locally; DOCUMENTS is single-quoted remotely so the worker
# shell neither globs nor word-splits the s3 pattern.
gantry run \
    --task-name "${RUN_NAME}" \
    --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \
    --allow-dirty \
    --workspace ai2/davidw-oe-annealing \
    --beaker-image 'petew/olmo-torch23-gantry' \
    --timeout -1 \
    --show-logs \
    --host-networking \
    --venv 'base' \
    --priority "${PRIORITY}" \
    --leader-selection \
    --gpus 8 \
    --replicas "${NUM_NODES}" \
    --preemptible \
    --cluster "${CLUSTER}" \
    --budget ai2/oe-data \
    --env LOG_FILTER_TYPE=local_rank0_only \
    --env OMP_NUM_THREADS=8 \
    --env "BEAKER_USER_ID=${BEAKER_USER_ID}" \
    --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \
    --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \
    --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \
    --shared-memory 10GiB \
    --install "pip install -e classifiers/" \
    --yes \
    -- /bin/bash -c "huggingface-cli download '${MODEL_NAME}' && torchrun --nnodes '${NUM_NODES}:${NUM_NODES}' --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint \"\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400\" --node_rank \"\${BEAKER_REPLICA_RANK}\" --rdzv_conf 'read_timeout=3600' -m dolma_classifiers.inference --source-prefix '${DOCUMENTS}' --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name '${MODEL_NAME}' --num-workers 8 --prefetch-factor 8"
45 changes: 45 additions & 0 deletions classifiers/scripts/fineweb_automath_web.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/bin/bash
# Submit a Beaker/gantry job that scores AutoMathText web documents with the
# FineWeb-Edu quality classifier via dolma_classifiers.inference.
set -euo pipefail

# S3 glob of input documents; single-quoted so the local shell never expands it.
DOCUMENTS='s3://ai2-llm/pretraining-data/sources/math-ai_AutoMathText/v0/documents/web/*.gz'

NUM_NODES=1
MODEL_NAME="HuggingFaceFW/fineweb-edu-classifier"
CLUSTER="ai2/jupiter*"
BATCH_SIZE=1024
PRIORITY="urgent"

# Derive a short, stable run name from model + input path so identical runs
# map to the same experiment name.
RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}')
RUN_NAME="fineweb_classifier_${RUN_HASH:0:8}"

# Gantry reads the experiment name from this environment variable.
export BEAKER_EXPERIMENT_NAME="${RUN_NAME}"

# Resolve the Beaker user before submitting; under `set -e` a failure here
# aborts instead of silently passing an empty value into the job.
BEAKER_USER_ID=$(beaker account whoami --format json | jq -cr '.[0].name')

# Remote payload: `\${…}` stays escaped so BEAKER_* variables are expanded on
# the worker, not locally; DOCUMENTS is single-quoted remotely so the worker
# shell neither globs nor word-splits the s3 pattern.
gantry run \
    --task-name "${RUN_NAME}" \
    --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \
    --allow-dirty \
    --workspace ai2/davidw-oe-annealing \
    --beaker-image 'petew/olmo-torch23-gantry' \
    --timeout -1 \
    --show-logs \
    --host-networking \
    --venv 'base' \
    --priority "${PRIORITY}" \
    --leader-selection \
    --gpus 8 \
    --replicas "${NUM_NODES}" \
    --preemptible \
    --cluster "${CLUSTER}" \
    --budget ai2/oe-data \
    --env LOG_FILTER_TYPE=local_rank0_only \
    --env OMP_NUM_THREADS=8 \
    --env "BEAKER_USER_ID=${BEAKER_USER_ID}" \
    --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \
    --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \
    --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \
    --shared-memory 10GiB \
    --install "pip install -e classifiers/" \
    --yes \
    -- /bin/bash -c "huggingface-cli download '${MODEL_NAME}' && torchrun --nnodes '${NUM_NODES}:${NUM_NODES}' --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint \"\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400\" --node_rank \"\${BEAKER_REPLICA_RANK}\" --rdzv_conf 'read_timeout=3600' -m dolma_classifiers.inference --source-prefix '${DOCUMENTS}' --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name '${MODEL_NAME}' --num-workers 8 --prefetch-factor 8"
45 changes: 45 additions & 0 deletions classifiers/scripts/fineweb_flan.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/bin/bash
# Submit a Beaker/gantry job that scores Tulu FLAN documents with the
# FineWeb-Edu quality classifier via dolma_classifiers.inference.
set -euo pipefail

# S3 glob of input documents; single-quoted so the local shell never expands it.
DOCUMENTS='s3://ai2-llm/pretraining-data/sources/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/documents/*.gz'

NUM_NODES=1
MODEL_NAME="HuggingFaceFW/fineweb-edu-classifier"
CLUSTER="ai2/jupiter*"
BATCH_SIZE=1024
PRIORITY="urgent"

# Derive a short, stable run name from model + input path so identical runs
# map to the same experiment name.
RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}')
RUN_NAME="fineweb_classifier_${RUN_HASH:0:8}"

# Gantry reads the experiment name from this environment variable.
export BEAKER_EXPERIMENT_NAME="${RUN_NAME}"

# Resolve the Beaker user before submitting; under `set -e` a failure here
# aborts instead of silently passing an empty value into the job.
BEAKER_USER_ID=$(beaker account whoami --format json | jq -cr '.[0].name')

# Remote payload: `\${…}` stays escaped so BEAKER_* variables are expanded on
# the worker, not locally; DOCUMENTS is single-quoted remotely so the worker
# shell neither globs nor word-splits the s3 pattern.
gantry run \
    --task-name "${RUN_NAME}" \
    --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \
    --allow-dirty \
    --workspace ai2/davidw-oe-annealing \
    --beaker-image 'petew/olmo-torch23-gantry' \
    --timeout -1 \
    --show-logs \
    --host-networking \
    --venv 'base' \
    --priority "${PRIORITY}" \
    --leader-selection \
    --gpus 8 \
    --replicas "${NUM_NODES}" \
    --preemptible \
    --cluster "${CLUSTER}" \
    --budget ai2/oe-data \
    --env LOG_FILTER_TYPE=local_rank0_only \
    --env OMP_NUM_THREADS=8 \
    --env "BEAKER_USER_ID=${BEAKER_USER_ID}" \
    --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \
    --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \
    --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \
    --shared-memory 10GiB \
    --install "pip install -e classifiers/" \
    --yes \
    -- /bin/bash -c "huggingface-cli download '${MODEL_NAME}' && torchrun --nnodes '${NUM_NODES}:${NUM_NODES}' --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint \"\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400\" --node_rank \"\${BEAKER_REPLICA_RANK}\" --rdzv_conf 'read_timeout=3600' -m dolma_classifiers.inference --source-prefix '${DOCUMENTS}' --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name '${MODEL_NAME}' --num-workers 8 --prefetch-factor 8"
45 changes: 45 additions & 0 deletions classifiers/scripts/fineweb_owm.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/bin/bash
# Submit a Beaker/gantry job that scores Proof-Pile-2 (OpenWebMath) documents
# with the FineWeb-Edu quality classifier via dolma_classifiers.inference.
set -euo pipefail

# S3 glob of input documents; single-quoted so the local shell never expands it.
DOCUMENTS='s3://ai2-llm/pretraining-data/sources/proof-pile-2/v0_decontaminated/documents/*/*/*.gz'

NUM_NODES=1
MODEL_NAME="HuggingFaceFW/fineweb-edu-classifier"
CLUSTER="ai2/jupiter*"
BATCH_SIZE=1024
PRIORITY="urgent"

# Derive a short, stable run name from model + input path so identical runs
# map to the same experiment name.
RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}')
RUN_NAME="fineweb_classifier_${RUN_HASH:0:8}"

# Gantry reads the experiment name from this environment variable.
export BEAKER_EXPERIMENT_NAME="${RUN_NAME}"

# Resolve the Beaker user before submitting; under `set -e` a failure here
# aborts instead of silently passing an empty value into the job.
BEAKER_USER_ID=$(beaker account whoami --format json | jq -cr '.[0].name')

# Remote payload: `\${…}` stays escaped so BEAKER_* variables are expanded on
# the worker, not locally; DOCUMENTS is single-quoted remotely so the worker
# shell neither globs nor word-splits the s3 pattern.
gantry run \
    --task-name "${RUN_NAME}" \
    --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \
    --allow-dirty \
    --workspace ai2/davidw-oe-annealing \
    --beaker-image 'petew/olmo-torch23-gantry' \
    --timeout -1 \
    --show-logs \
    --host-networking \
    --venv 'base' \
    --priority "${PRIORITY}" \
    --leader-selection \
    --gpus 8 \
    --replicas "${NUM_NODES}" \
    --preemptible \
    --cluster "${CLUSTER}" \
    --budget ai2/oe-data \
    --env LOG_FILTER_TYPE=local_rank0_only \
    --env OMP_NUM_THREADS=8 \
    --env "BEAKER_USER_ID=${BEAKER_USER_ID}" \
    --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \
    --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \
    --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \
    --shared-memory 10GiB \
    --install "pip install -e classifiers/" \
    --yes \
    -- /bin/bash -c "huggingface-cli download '${MODEL_NAME}' && torchrun --nnodes '${NUM_NODES}:${NUM_NODES}' --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint \"\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400\" --node_rank \"\${BEAKER_REPLICA_RANK}\" --rdzv_conf 'read_timeout=3600' -m dolma_classifiers.inference --source-prefix '${DOCUMENTS}' --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name '${MODEL_NAME}' --num-workers 8 --prefetch-factor 8"
45 changes: 45 additions & 0 deletions classifiers/scripts/fineweb_pes2o.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/bin/bash
# Submit a Beaker/gantry job that scores S2 (peS2o) documents with the
# FineWeb-Edu quality classifier via dolma_classifiers.inference.
set -euo pipefail

# S3 glob of input documents; single-quoted so the local shell never expands it.
DOCUMENTS='s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/*/*/*/*.gz'

NUM_NODES=1
MODEL_NAME="HuggingFaceFW/fineweb-edu-classifier"
CLUSTER="ai2/jupiter*"
BATCH_SIZE=1024
PRIORITY="urgent"

# Derive a short, stable run name from model + input path so identical runs
# map to the same experiment name.
RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}')
RUN_NAME="fineweb_classifier_${RUN_HASH:0:8}"

# Gantry reads the experiment name from this environment variable.
export BEAKER_EXPERIMENT_NAME="${RUN_NAME}"

# Resolve the Beaker user before submitting; under `set -e` a failure here
# aborts instead of silently passing an empty value into the job.
BEAKER_USER_ID=$(beaker account whoami --format json | jq -cr '.[0].name')

# Remote payload: `\${…}` stays escaped so BEAKER_* variables are expanded on
# the worker, not locally; DOCUMENTS is single-quoted remotely so the worker
# shell neither globs nor word-splits the s3 pattern.
gantry run \
    --task-name "${RUN_NAME}" \
    --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \
    --allow-dirty \
    --workspace ai2/davidw-oe-annealing \
    --beaker-image 'petew/olmo-torch23-gantry' \
    --timeout -1 \
    --show-logs \
    --host-networking \
    --venv 'base' \
    --priority "${PRIORITY}" \
    --leader-selection \
    --gpus 8 \
    --replicas "${NUM_NODES}" \
    --preemptible \
    --cluster "${CLUSTER}" \
    --budget ai2/oe-data \
    --env LOG_FILTER_TYPE=local_rank0_only \
    --env OMP_NUM_THREADS=8 \
    --env "BEAKER_USER_ID=${BEAKER_USER_ID}" \
    --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \
    --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \
    --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \
    --shared-memory 10GiB \
    --install "pip install -e classifiers/" \
    --yes \
    -- /bin/bash -c "huggingface-cli download '${MODEL_NAME}' && torchrun --nnodes '${NUM_NODES}:${NUM_NODES}' --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint \"\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400\" --node_rank \"\${BEAKER_REPLICA_RANK}\" --rdzv_conf 'read_timeout=3600' -m dolma_classifiers.inference --source-prefix '${DOCUMENTS}' --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name '${MODEL_NAME}' --num-workers 8 --prefetch-factor 8"
45 changes: 45 additions & 0 deletions classifiers/scripts/fineweb_se.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/bin/bash
# Submit a Beaker/gantry job that scores StackExchange documents with the
# FineWeb-Edu quality classifier via dolma_classifiers.inference.
set -euo pipefail

# S3 glob of input documents (zstd-compressed); single-quoted so the local
# shell never expands it.
DOCUMENTS='s3://ai2-llm/pretraining-data/sources/stackexchange/v0/documents/20240930/*.zst'

NUM_NODES=1
MODEL_NAME="HuggingFaceFW/fineweb-edu-classifier"
CLUSTER="ai2/jupiter*"
BATCH_SIZE=1024
PRIORITY="urgent"

# Derive a short, stable run name from model + input path so identical runs
# map to the same experiment name.
RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}')
RUN_NAME="fineweb_classifier_${RUN_HASH:0:8}"

# Gantry reads the experiment name from this environment variable.
export BEAKER_EXPERIMENT_NAME="${RUN_NAME}"

# Resolve the Beaker user before submitting; under `set -e` a failure here
# aborts instead of silently passing an empty value into the job.
BEAKER_USER_ID=$(beaker account whoami --format json | jq -cr '.[0].name')

# Remote payload: `\${…}` stays escaped so BEAKER_* variables are expanded on
# the worker, not locally; DOCUMENTS is single-quoted remotely so the worker
# shell neither globs nor word-splits the s3 pattern.
gantry run \
    --task-name "${RUN_NAME}" \
    --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \
    --allow-dirty \
    --workspace ai2/davidw-oe-annealing \
    --beaker-image 'petew/olmo-torch23-gantry' \
    --timeout -1 \
    --show-logs \
    --host-networking \
    --venv 'base' \
    --priority "${PRIORITY}" \
    --leader-selection \
    --gpus 8 \
    --replicas "${NUM_NODES}" \
    --preemptible \
    --cluster "${CLUSTER}" \
    --budget ai2/oe-data \
    --env LOG_FILTER_TYPE=local_rank0_only \
    --env OMP_NUM_THREADS=8 \
    --env "BEAKER_USER_ID=${BEAKER_USER_ID}" \
    --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \
    --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \
    --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \
    --shared-memory 10GiB \
    --install "pip install -e classifiers/" \
    --yes \
    -- /bin/bash -c "huggingface-cli download '${MODEL_NAME}' && torchrun --nnodes '${NUM_NODES}:${NUM_NODES}' --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint \"\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400\" --node_rank \"\${BEAKER_REPLICA_RANK}\" --rdzv_conf 'read_timeout=3600' -m dolma_classifiers.inference --source-prefix '${DOCUMENTS}' --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name '${MODEL_NAME}' --num-workers 8 --prefetch-factor 8"
46 changes: 46 additions & 0 deletions classifiers/scripts/nvidia-deberta-automath-arxiv.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#!/bin/bash
# Submit a Beaker/gantry job that scores AutoMathText arXiv documents with the
# NVIDIA DeBERTa quality classifier via dolma_classifiers.inference.
set -euo pipefail

# S3 glob of input documents; single-quoted so the local shell never expands it.
DOCUMENTS='s3://ai2-llm/pretraining-data/sources/math-ai_AutoMathText/v0/documents/arxiv/*/*.gz'

NUM_NODES=1
MODEL_NAME="nvidia/quality-classifier-deberta"
CLUSTER="ai2/jupiter*"
# Smaller batch than the fineweb scripts: this model runs with --max-length 1024.
BATCH_SIZE=512
PRIORITY="urgent"

# Derive a short, stable run name from model + input path so identical runs
# map to the same experiment name.
RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}')
RUN_NAME="nvidia_deberta_${RUN_HASH:0:8}"

# Gantry reads the experiment name from this environment variable.
export BEAKER_EXPERIMENT_NAME="${RUN_NAME}"

# Resolve the Beaker user before submitting; under `set -e` a failure here
# aborts instead of silently passing an empty value into the job.
BEAKER_USER_ID=$(beaker account whoami --format json | jq -cr '.[0].name')

# Remote payload: `\${…}` stays escaped so BEAKER_* variables are expanded on
# the worker, not locally; DOCUMENTS is single-quoted remotely so the worker
# shell neither globs nor word-splits the s3 pattern.
gantry run \
    --task-name "${RUN_NAME}" \
    --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \
    --allow-dirty \
    --workspace ai2/davidw-oe-annealing \
    --beaker-image 'petew/olmo-torch23-gantry' \
    --timeout -1 \
    --show-logs \
    --host-networking \
    --venv 'base' \
    --priority "${PRIORITY}" \
    --leader-selection \
    --gpus 8 \
    --replicas "${NUM_NODES}" \
    --preemptible \
    --cluster "${CLUSTER}" \
    --budget ai2/oe-data \
    --env LOG_FILTER_TYPE=local_rank0_only \
    --env OMP_NUM_THREADS=8 \
    --env "BEAKER_USER_ID=${BEAKER_USER_ID}" \
    --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \
    --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \
    --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \
    --shared-memory 10GiB \
    --install "pip install -e classifiers/" \
    --yes \
    -- /bin/bash -c "huggingface-cli download '${MODEL_NAME}' && torchrun --nnodes '${NUM_NODES}:${NUM_NODES}' --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint \"\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400\" --node_rank \"\${BEAKER_REPLICA_RANK}\" --rdzv_conf 'read_timeout=3600' -m dolma_classifiers.inference --source-prefix '${DOCUMENTS}' --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name '${MODEL_NAME}' --num-workers 4 --model-compile --max-length 1024"
Loading

0 comments on commit 973621f

Please sign in to comment.