-
Notifications
You must be signed in to change notification settings - Fork 0
/
monitoring_script.sh
executable file
·149 lines (141 loc) · 7.85 KB
/
monitoring_script.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#!/bin/bash
if [[ -z "$HF_TOKEN" ]]; then
echo "Error: --hftoken is not present in environment, not sending jobs."
exit 1
fi
echo "HF token value was set, proceeding to send jobs."
if [[ -z "$CONDA_DEFAULT_ENV" ]]; then
echo "Conda environment is not active, proceeding to send jobs."
else
echo "Conda environment was activated and it would've caused jobs to fail after allocation. Not sending jobs."
exit 1
fi
test=$1
warm_restart=$2
SECONDS=0
start="2024-08-13T17:15:00"
interval=7200
completed=0
total=0
failed_jobs=0
repeats=0
layer=1
seed=0
parent_fold=0
start_fold=$(( parent_fold ))
end_fold=$(( parent_fold + 1 ))
#######################################
# Parse layer, seed and parent fold out of an sbatch job name.
# Example: ft_l1_s40_f0_strat -> layer=1 seed=40 parent_fold=0
# Globals (written): layer, seed, parent_fold — consumed by run_failed().
# Arguments: $1 - job name of the form ft_l<L>_s<S>_f<F>_strat
#######################################
extract_params_from_name() {
  local name=$1
  # Split on any of '_', 'l', 'f', 's'; for this name shape the numeric
  # parts land in fields 4, 6 and 8. Quote the substitution and use -r so
  # the values survive unusual IFS / backslashes intact.
  read -r layer seed parent_fold <<< "$( printf '%s' "$name" | awk -F '[_lfs]' '{print $4, $6, $8}' )"
}
#######################################
# Analyze one sacct snapshot (pipe-delimited, sorted by end/start time) and:
#   * re-submit any fold whose .0 / .1 job step failed, at half walltime (15h)
#   * tally failed, completed and repeated (double-running) jobs
#   * append a human-readable log of every decision to monitoring_email.txt
# Globals (written): total, failed_jobs, completed, repeats,
#   start_fold, end_fold, plus layer/seed/parent_fold via extract_params_from_name.
# Arguments: $1 - path to the sacct snapshot file (all_jobs.txt)
# NOTE(review): rows are selected by grepping 'strat' in the jobname column —
# assumes every monitored job name contains '_strat'; verify against submit names.
#######################################
run_failed () {
# Unique job names seen in the snapshot (column 2 of sacct -P output).
total=$( grep strat $1 | awk -F '|' '{print $2}' | sort -u | wc -l )
echo $(date) >> monitoring_email.txt
echo "Running failed jobs analysis as stored in file $1. Total jobs in it are $total" >> monitoring_email.txt
echo "======================================================================================" >> monitoring_email.txt
jobnames=$( grep 'strat' $1 | awk -F '|' '{print $2}' | sort -u )
# Reset the counters that the main loop substitutes into the email subject.
failed_jobs=0
completed=0
repeats=0
for jobname in $jobnames; do
# Sets layer, seed, parent_fold from the ft_l*_s*_f*_strat name.
extract_params_from_name $jobname
# if the .0 or .1 have failed status -> rerun the .0 or .1 but with half the time, so -t=15:00:00
# File is sorted by (end,start), so 'tail -n 1' picks the most recent row for this name.
jobid=$(grep $jobname $1 | tail -n 1 | awk -F '|' '{print $1}')
# Presumably one sacct step per fold: .0 = parent fold, .1 = parent fold + 1
# — TODO confirm against finetune_provpath.sh.
check_0=$(grep ${jobid}.0 $1 | grep -i 'fail' | wc -l)
check_1=$(grep ${jobid}.1 $1 | grep -i 'fail' | wc -l)
# handle splitting of jobs
# f0 might've had f1 fail but then monitor started f1, so exclude the new running f1 from fail count
# for the scenario of f0 having .0 fail new job with name f0 will start and the tail -n 1 in jobname will handle it.
# cancel removed in txt, running, pending, complete, timeout or fail remaining.
split=$(( parent_fold + 1 ))
not_fail_1=$(grep ft_l${layer}_s${seed}_f${split}_strat $1 | tail -n 1 | grep -ivE 'fail|timeout' | wc -l)
check_1=$(( $check_1 - $not_fail_1 ))
# Step .0 failed: resubmit only the parent fold (start_fold == end_fold).
if [ "$check_0" == "1" ]; then
start_fold=$(( parent_fold ))
end_fold=$(( parent_fold ))
# Log the failed job's sacct rows and the exact resubmission command.
job_details=$(grep ${jobid} $1)
echo "$job_details" >> monitoring_email.txt
echo "sbatch -J ft_l${layer}_s${seed}_f${start_fold}_strat -t 15:00:00 -A carney-tserre-condo finetune_provpath.sh --data TCGA --seed $seed --featlayer ${layer} --start_fold ${start_fold} --end_fold ${end_fold} --pat_strat 1 --test_strat 1 --val_strat 1 --train_strat 1" >> monitoring_email.txt
sbatch -J ft_l${layer}_s${seed}_f${start_fold}_strat -t 15:00:00 -A carney-tserre-condo finetune_provpath.sh --data TCGA --seed $seed --featlayer ${layer} --start_fold ${start_fold} --end_fold ${end_fold} --pat_strat 1 --test_strat 1 --val_strat 1 --train_strat 1 | cat >> monitoring_email.txt
echo "-----------------------------------------------------------------------------------------------" >> monitoring_email.txt
fi
# Step .1 failed (and no newer non-failed f<split> job exists): resubmit
# only the split fold parent_fold + 1.
if [ "$check_1" == "1" ]; then
start_fold=$(( parent_fold + 1 ))
end_fold=$(( parent_fold + 1 ))
job_details=$(grep ${jobid} $1)
echo "$job_details" >> monitoring_email.txt
echo "sbatch -J ft_l${layer}_s${seed}_f${start_fold}_strat -t 15:00:00 -A carney-tserre-condo finetune_provpath.sh --data TCGA --seed $seed --featlayer ${layer} --start_fold ${start_fold} --end_fold ${end_fold} --pat_strat 1 --test_strat 1 --val_strat 1 --train_strat 1" >> monitoring_email.txt
sbatch -J ft_l${layer}_s${seed}_f${start_fold}_strat -t 15:00:00 -A carney-tserre-condo finetune_provpath.sh --data TCGA --seed $seed --featlayer ${layer} --start_fold ${start_fold} --end_fold ${end_fold} --pat_strat 1 --test_strat 1 --val_strat 1 --train_strat 1 | cat >> monitoring_email.txt
echo "-----------------------------------------------------------------------------------------------" >> monitoring_email.txt
fi
# A job counts as failed if either step failed; otherwise check completion.
if [[ "$check_0" == "1" || "$check_1" == "1" ]]; then
failed_jobs=$(( failed_jobs + 1 ))
else
chck_complete=$( grep "strat" $1 | grep $jobid | grep -i "complete" | wc -l )
if [ "$chck_complete" = "1" ]; then
completed=$(( completed + 1 ))
fi
fi
# check for repeats
# Two or more running/pending rows for the same name means a duplicate launch.
job_count=$( grep $jobname $1 | grep -iE 'running|pending' | wc -l )
if [ "$job_count" -ge 2 ]; then
echo "Repeatition in job: $jobname" >> monitoring_email.txt
cat $1 | grep $jobname | grep -iE 'running|pending' >> monitoring_email.txt
echo "-----------------------------------------------------------------------------------------------" >> monitoring_email.txt
repeats=$(( repeats + 1 ))
fi
done
}
# Establish the sacct query window: by default, from the script's start time.
start=$(date +%Y-%m-%dT%H:%M:%S)
# warm start monitoring based on all jobs from 13th August 17:15 —
# rebuilds failed/completed counts from the full job history.
if [ "$warm_restart" == "1" ]; then
  start="2024-08-13T17:15:00" # earlier timestamp from when all jobs had 2 folds
  # start=2024-08-17T13:49:30 # the time monitoring job was interrupted because of inaccurate failed & complete counts along with grep errors in the logs.
fi
# Snapshot job history: drop batch/extern/interactive/cancelled rows and the
# header line, then sort by (end, start) so 'tail -n 1' in run_failed always
# picks the newest entry; unknown/pending end times sort last.
sacct -PS "$start" -o "jobid,jobname,submit,state,elapsed,start,end,nodelist,timelimit" | grep -ivE "batch|extern|interact|cancel|NodeList" | sort -t "|" -k7,7 -k6,6 > all_jobs.txt
sleep 1
run_failed all_jobs.txt
echo "Starting from old cache (date $start). Failed jobs were: $failed_jobs , completed are $completed"
# Poll loop: every $interval seconds (immediately when test=1) refresh the
# sacct snapshot, relaunch failures via run_failed, and email a summary.
# next_poll preserves the original multiple-of-$interval cadence while letting
# the loop sleep between checks instead of busy-spinning; it also guarantees
# exactly one trigger per interval (the old 'SECONDS % interval == 0' test
# could fire repeatedly within the same second, or be skipped entirely).
next_poll=$(( ( SECONDS / interval + 1 ) * interval ))
while true; do
  if (( SECONDS >= next_poll )) || [ "$test" = "1" ]; then
    default="Subject: Jobs: Running {}, Launched or Fail {}, Complete {}, Pending {}, Repeats {}. Memory: Scratch {}, Home {}. Monitoring Dur: {}"
    # time
    hours=$(( SECONDS / 3600 ))
    sed -i "s/Monitoring Dur: {}/Monitoring Dur: ${hours}h/" monitoring_email.txt
    # memory: checkquota rows 2-3, columns 3-4 — home usage then scratch usage.
    # NOTE(review): assumes checkquota prints home before scratch — matches the
    # original parsing order; confirm against the cluster's checkquota output.
    read -r home_mem home_mem_per scratch_mem scratch_mem_per <<< "$(checkquota | head -3 | awk -F ' ' 'NR>1 { print $3, $4 }' | tr '\n' ' ')"
    sed -i "s/Scratch {}, Home {}/Scratch ${scratch_mem}G ${scratch_mem_per}%, Home ${home_mem}G ${home_mem_per}%/" monitoring_email.txt
    # jobs
    # store the job details since the start of the script sorted by (end,start) time. Unknown/pending jobs are at the end.
    echo "Polling job status since $start"
    sacct -PS "$start" -o "jobid,jobname,submit,state,elapsed,start,end,nodelist,timelimit" | grep -ivE "batch|extern|interact|cancel" | sort -t "|" -k7,7 -k6,6 > all_jobs.txt
    sleep 1
    # update completed and failed jobs + launch new jobs
    run_failed all_jobs.txt
    running=$(myq | grep -c gpu2)
    pending=$(myq | grep -c "N/A")
    sed -i "s/Running {}/Running $running/" monitoring_email.txt
    sed -i "s/Pending {}/Pending $pending/" monitoring_email.txt
    sed -i "s/Fail {}/Fail ${failed_jobs}/" monitoring_email.txt
    sed -i "s/Complete {}/Complete ${completed}/" monitoring_email.txt
    sed -i "s/Repeats {}/Repeats ${repeats}/" monitoring_email.txt
    sendmail -v [email protected] < monitoring_email.txt
    # reset mail: restore the placeholder subject and keep only the header lines
    sed -i "s/^Subject:.*/${default}/" monitoring_email.txt
    head -n 6 monitoring_email.txt > temp.txt && mv temp.txt monitoring_email.txt
    # schedule the next poll at the following multiple of $interval
    next_poll=$(( ( SECONDS / interval + 1 ) * interval ))
  fi
  if [ "$test" = "1" ]; then
    exit 0
  fi
  # monitor for 4 days, 4*24*60*60; '>=' (not string '=') so the deadline
  # cannot be skipped when no iteration lands on that exact second.
  if (( SECONDS >= 345600 )); then
    echo "Ending monitoring proc, ran for $(( SECONDS / 3600 ))h or $(( SECONDS / 86400 ))days"
    exit 0
  fi
  sleep 5  # idle between checks instead of busy-spinning the CPU
done