-
Notifications
You must be signed in to change notification settings - Fork 0
/
monitoring_script.sh
executable file
·149 lines (141 loc) · 7.85 KB
/
monitoring_script.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#!/bin/bash
if [[ -z "$HF_TOKEN" ]]; then
echo "Error: --hftoken is not present in environment, not sending jobs."
exit 1
fi
echo "HF token value was set, proceeding to send jobs."
if [[ -z "$CONDA_DEFAULT_ENV" ]]; then
echo "Conda environment is not active, proceeding to send jobs."
else
echo "Conda environment was activated and it would've caused jobs to fail after allocation. Not sending jobs."
exit 1
fi
test=$1
warm_restart=$2
SECONDS=0
start="2024-08-13T17:15:00"
interval=7200
completed=0
total=0
failed_jobs=0
repeats=0
layer=1
seed=0
parent_fold=0
start_fold=$(( parent_fold ))
end_fold=$(( parent_fold + 1 ))
#######################################
# Parse layer, seed and parent fold out of an sbatch job name.
# Example: ft_l1_s40_f0_strat -> layer=1 seed=40 parent_fold=0
# Globals (written): layer, seed, parent_fold — consumed by run_failed().
# Arguments: $1 - job name of the form ft_l<L>_s<S>_f<F>_strat
#######################################
extract_params_from_name() {
  local name=$1
  # Split on any of '_', 'l', 'f', 's'; for this name shape the numeric
  # parts land in fields 4, 6 and 8. Quote the substitution and use -r so
  # the values survive unusual IFS / backslashes intact.
  read -r layer seed parent_fold <<< "$( printf '%s' "$name" | awk -F '[_lfs]' '{print $4, $6, $8}' )"
}
#######################################
# Analyze one sacct snapshot (pipe-delimited, sorted by end/start time) and:
#   * re-submit any fold whose .0 / .1 job step failed, at half walltime (15h)
#   * tally failed, completed and repeated (double-running) jobs
#   * append a human-readable log of every decision to monitoring_email.txt
# Globals (written): total, failed_jobs, completed, repeats,
#   start_fold, end_fold, plus layer/seed/parent_fold via extract_params_from_name.
# Arguments: $1 - path to the sacct snapshot file (all_jobs.txt)
# NOTE(review): rows are selected by grepping 'strat' in the jobname column —
# assumes every monitored job name contains '_strat'; verify against submit names.
#######################################
run_failed () {
# Unique job names seen in the snapshot (column 2 of sacct -P output).
total=$( grep strat $1 | awk -F '|' '{print $2}' | sort -u | wc -l )
echo $(date) >> monitoring_email.txt
echo "Running failed jobs analysis as stored in file $1. Total jobs in it are $total" >> monitoring_email.txt
echo "======================================================================================" >> monitoring_email.txt
jobnames=$( grep 'strat' $1 | awk -F '|' '{print $2}' | sort -u )
# Reset the counters that the main loop substitutes into the email subject.
failed_jobs=0
completed=0
repeats=0
for jobname in $jobnames; do
# Sets layer, seed, parent_fold from the ft_l*_s*_f*_strat name.
extract_params_from_name $jobname
# if the .0 or .1 have failed status -> rerun the .0 or .1 but with half the time, so -t=15:00:00
# File is sorted by (end,start), so 'tail -n 1' picks the most recent row for this name.
jobid=$(grep $jobname $1 | tail -n 1 | awk -F '|' '{print $1}')
# Presumably one sacct step per fold: .0 = parent fold, .1 = parent fold + 1
# — TODO confirm against finetune_provpath.sh.
check_0=$(grep ${jobid}.0 $1 | grep -i 'fail' | wc -l)
check_1=$(grep ${jobid}.1 $1 | grep -i 'fail' | wc -l)
# handle splitting of jobs
# f0 might've had f1 fail but then monitor started f1, so exclude the new running f1 from fail count
# for the scenario of f0 having .0 fail new job with name f0 will start and the tail -n 1 in jobname will handle it.
# cancel removed in txt, running, pending, complete, timeout or fail remaining.
split=$(( parent_fold + 1 ))
not_fail_1=$(grep ft_l${layer}_s${seed}_f${split}_strat $1 | tail -n 1 | grep -ivE 'fail|timeout' | wc -l)
check_1=$(( $check_1 - $not_fail_1 ))
# Step .0 failed: resubmit only the parent fold (start_fold == end_fold).
if [ "$check_0" == "1" ]; then
start_fold=$(( parent_fold ))
end_fold=$(( parent_fold ))
# Log the failed job's sacct rows and the exact resubmission command.
job_details=$(grep ${jobid} $1)
echo "$job_details" >> monitoring_email.txt
echo "sbatch -J ft_l${layer}_s${seed}_f${start_fold}_strat -t 15:00:00 -A carney-tserre-condo finetune_provpath.sh --data TCGA --seed $seed --featlayer ${layer} --start_fold ${start_fold} --end_fold ${end_fold} --pat_strat 1 --test_strat 1 --val_strat 1 --train_strat 1" >> monitoring_email.txt
sbatch -J ft_l${layer}_s${seed}_f${start_fold}_strat -t 15:00:00 -A carney-tserre-condo finetune_provpath.sh --data TCGA --seed $seed --featlayer ${layer} --start_fold ${start_fold} --end_fold ${end_fold} --pat_strat 1 --test_strat 1 --val_strat 1 --train_strat 1 | cat >> monitoring_email.txt
echo "-----------------------------------------------------------------------------------------------" >> monitoring_email.txt
fi
# Step .1 failed (and no newer non-failed f<split> job exists): resubmit
# only the split fold parent_fold + 1.
if [ "$check_1" == "1" ]; then
start_fold=$(( parent_fold + 1 ))
end_fold=$(( parent_fold + 1 ))
job_details=$(grep ${jobid} $1)
echo "$job_details" >> monitoring_email.txt
echo "sbatch -J ft_l${layer}_s${seed}_f${start_fold}_strat -t 15:00:00 -A carney-tserre-condo finetune_provpath.sh --data TCGA --seed $seed --featlayer ${layer} --start_fold ${start_fold} --end_fold ${end_fold} --pat_strat 1 --test_strat 1 --val_strat 1 --train_strat 1" >> monitoring_email.txt
sbatch -J ft_l${layer}_s${seed}_f${start_fold}_strat -t 15:00:00 -A carney-tserre-condo finetune_provpath.sh --data TCGA --seed $seed --featlayer ${layer} --start_fold ${start_fold} --end_fold ${end_fold} --pat_strat 1 --test_strat 1 --val_strat 1 --train_strat 1 | cat >> monitoring_email.txt
echo "-----------------------------------------------------------------------------------------------" >> monitoring_email.txt
fi
# A job counts as failed if either step failed; otherwise check completion.
if [[ "$check_0" == "1" || "$check_1" == "1" ]]; then
failed_jobs=$(( failed_jobs + 1 ))
else
chck_complete=$( grep "strat" $1 | grep $jobid | grep -i "complete" | wc -l )
if [ "$chck_complete" = "1" ]; then
completed=$(( completed + 1 ))
fi
fi
# check for repeats
# Two or more running/pending rows for the same name means a duplicate launch.
job_count=$( grep $jobname $1 | grep -iE 'running|pending' | wc -l )
if [ "$job_count" -ge 2 ]; then
echo "Repeatition in job: $jobname" >> monitoring_email.txt
cat $1 | grep $jobname | grep -iE 'running|pending' >> monitoring_email.txt
echo "-----------------------------------------------------------------------------------------------" >> monitoring_email.txt
repeats=$(( repeats + 1 ))
fi
done
}
# Establish the sacct query window: by default, from the script's start time.
start=$(date +%Y-%m-%dT%H:%M:%S)
# warm start monitoring based on all jobs from 13th August 17:15 —
# rebuilds failed/completed counts from the full job history.
if [ "$warm_restart" == "1" ]; then
  start="2024-08-13T17:15:00" # earlier timestamp from when all jobs had 2 folds
  # start=2024-08-17T13:49:30 # the time monitoring job was interrupted because of inaccurate failed & complete counts along with grep errors in the logs.
fi
# Snapshot job history: drop batch/extern/interactive/cancelled rows and the
# header line, then sort by (end, start) so 'tail -n 1' in run_failed always
# picks the newest entry; unknown/pending end times sort last.
sacct -PS "$start" -o "jobid,jobname,submit,state,elapsed,start,end,nodelist,timelimit" | grep -ivE "batch|extern|interact|cancel|NodeList" | sort -t "|" -k7,7 -k6,6 > all_jobs.txt
sleep 1
run_failed all_jobs.txt
echo "Starting from old cache (date $start). Failed jobs were: $failed_jobs , completed are $completed"
# Poll loop: every $interval seconds (immediately when test=1) refresh the
# sacct snapshot, relaunch failures via run_failed, and email a summary.
# next_poll preserves the original multiple-of-$interval cadence while letting
# the loop sleep between checks instead of busy-spinning; it also guarantees
# exactly one trigger per interval (the old 'SECONDS % interval == 0' test
# could fire repeatedly within the same second, or be skipped entirely).
next_poll=$(( ( SECONDS / interval + 1 ) * interval ))
while true; do
  if (( SECONDS >= next_poll )) || [ "$test" = "1" ]; then
    default="Subject: Jobs: Running {}, Launched or Fail {}, Complete {}, Pending {}, Repeats {}. Memory: Scratch {}, Home {}. Monitoring Dur: {}"
    # time
    hours=$(( SECONDS / 3600 ))
    sed -i "s/Monitoring Dur: {}/Monitoring Dur: ${hours}h/" monitoring_email.txt
    # memory: checkquota rows 2-3, columns 3-4 — home usage then scratch usage.
    # NOTE(review): assumes checkquota prints home before scratch — matches the
    # original parsing order; confirm against the cluster's checkquota output.
    read -r home_mem home_mem_per scratch_mem scratch_mem_per <<< "$(checkquota | head -3 | awk -F ' ' 'NR>1 { print $3, $4 }' | tr '\n' ' ')"
    sed -i "s/Scratch {}, Home {}/Scratch ${scratch_mem}G ${scratch_mem_per}%, Home ${home_mem}G ${home_mem_per}%/" monitoring_email.txt
    # jobs
    # store the job details since the start of the script sorted by (end,start) time. Unknown/pending jobs are at the end.
    echo "Polling job status since $start"
    sacct -PS "$start" -o "jobid,jobname,submit,state,elapsed,start,end,nodelist,timelimit" | grep -ivE "batch|extern|interact|cancel" | sort -t "|" -k7,7 -k6,6 > all_jobs.txt
    sleep 1
    # update completed and failed jobs + launch new jobs
    run_failed all_jobs.txt
    running=$(myq | grep -c gpu2)
    pending=$(myq | grep -c "N/A")
    sed -i "s/Running {}/Running $running/" monitoring_email.txt
    sed -i "s/Pending {}/Pending $pending/" monitoring_email.txt
    sed -i "s/Fail {}/Fail ${failed_jobs}/" monitoring_email.txt
    sed -i "s/Complete {}/Complete ${completed}/" monitoring_email.txt
    sed -i "s/Repeats {}/Repeats ${repeats}/" monitoring_email.txt
    sendmail -v [email protected] < monitoring_email.txt
    # reset mail: restore the placeholder subject and keep only the header lines
    sed -i "s/^Subject:.*/${default}/" monitoring_email.txt
    head -n 6 monitoring_email.txt > temp.txt && mv temp.txt monitoring_email.txt
    # schedule the next poll at the following multiple of $interval
    next_poll=$(( ( SECONDS / interval + 1 ) * interval ))
  fi
  if [ "$test" = "1" ]; then
    exit 0
  fi
  # monitor for 4 days, 4*24*60*60; '>=' (not string '=') so the deadline
  # cannot be skipped when no iteration lands on that exact second.
  if (( SECONDS >= 345600 )); then
    echo "Ending monitoring proc, ran for $(( SECONDS / 3600 ))h or $(( SECONDS / 86400 ))days"
    exit 0
  fi
  sleep 5  # idle between checks instead of busy-spinning the CPU
done