-
Notifications
You must be signed in to change notification settings - Fork 1
/
grid_check
150 lines (129 loc) · 3.75 KB
/
grid_check
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#!/bin/bash
#
# wait for jobs to finish running, then report their exit status info
# jobs can be referred to by name or id
#
# Strict mode: stop on an unhandled failing command (-e) and on any
# reference to an unset variable (-u).
set -eu

# Invoked with no arguments at all: print the option summary and stop.
if (( $# == 0 )); then
  echo "usage: grid_wait [-Ppoll(3)] [-Ljobidfile] [-Ijobid[,jobid...] [-Ijobid[,jobid...] [-A] [-Ttaillines] [-O]"
  exit
fi
# --- job selection, filled in while the options are parsed ---
# grep pattern options accumulated as " -e <jobid>" per requested job
MATCH=""
# number of job ids collected so far
EXPITEMS=0
# file named by -L (empty until -L is seen)
LISTFILE=""

# --- tunables, each overridable by an option ---
# default poll interval is three seconds (-P)
POLL=3
# "YES" = skip old accounting data, "NO" = scan everything (-A)
SKIP="YES"
# number of log lines shown for a failed job (-T)
TAIL="5"
# "YES" = also show the stdout log of a failed job (-O)
OUTERR="NO"

# --- file locations, taken from the environment when already set ---
# file listing size of acctfile each day
: "${ACCTSIZES:=/home/vicker/gridengine/accountfilesizes}"
# file containing exit status info from grid engine
: "${ACCTFILE:=/var/lib/gridengine/blacklace/common/accounting}"
#######################################
# Register job ids to wait for.
# Globals:   MATCH (appended with " -e <jobid>"), EXPITEMS (incremented)
# Arguments: $1 - job ids separated by whitespace (spaces, tabs, newlines)
#######################################
add_jobids() {
  local jobid
  # One id per line after tr; "|| [ -n ... ]" keeps a last id that has no
  # trailing newline, and empty lines are skipped.
  while IFS= read -r jobid || [ -n "${jobid}" ]; do
    [ -n "${jobid}" ] || continue
    MATCH="${MATCH} -e ${jobid}"
    EXPITEMS=$(( EXPITEMS + 1 ))
  done < <(printf '%s\n' "$1" | tr -s '[:space:]' '\n')
}

#######################################
# Parse the command-line options. Every argument has the form -X[value],
# with the value glued directly to the option letter.
# Globals:   LISTFILE, TAIL, SKIP, OUTERR, POLL (written);
#            MATCH, EXPITEMS (updated through add_jobids)
# Arguments: the script's command-line arguments
# Exits:     1 (message on stderr) on a malformed or unknown option, or
#            when the -L file is not given or cannot be read
#######################################
parse_args() {
  local arg value
  while [ "$#" -gt 0 ]; do
    arg="$1"
    if [ "${arg:0:1}" != '-' ]; then
      echo "invalid argument: ${arg}" >&2
      exit 1
    fi
    # everything after the option letter
    value="${arg:2}"
    case "${arg:1:1}" in
      #name of file containing list of jobids
      L)
        LISTFILE="${value}"
        # Fail early: an empty name would otherwise leave us reading stdin,
        # and an unreadable file would silently drop the jobs it lists.
        if [ ! -r "${LISTFILE}" ] || [ -d "${LISTFILE}" ]; then
          echo "cannot read job id file: '${LISTFILE}'" >&2
          exit 1
        fi
        add_jobids "$(<"${LISTFILE}")"
        ;;
      #wait for numbered job(s) to complete eg -I123 -I345
      I)
        add_jobids "${value//,/ }"
        ;;
      #how many lines of error log to print
      T)
        TAIL="${value}"
        ;;
      #look through all completed jobs info, do not skip old jobs info
      A)
        SKIP="NO"
        ;;
      #dump tail of stdout log file as well as stderr if exit != 0
      O)
        OUTERR="YES"
        ;;
      #poll running jobs every ${POLL} seconds
      P)
        POLL="${value}"
        ;;
      *)
        echo "Unknown option ${arg}" >&2
        exit 1
        ;;
    esac
    shift
  done
}

parse_args "$@"
# Nothing was collected from -I / -L: say so and stop here.
[ -n "${MATCH}" ] || {
  echo "no jobs specified"
  exit
}
#######################################
# Look up the accounting-file size recorded for a given day.
# Arguments: $1 - sizes file; each line is read as "<date> <size>",
#                 separated by a single space
#            $2 - date string to look for
# Outputs:   the size from the first matching line on stdout. When the
#            file is unreadable, has no line for that date, or the size is
#            not a plain number, prints 0 and a warning on stderr, so the
#            caller scans the whole accounting file instead of handing
#            tail an empty offset.
#######################################
acct_skip_offset() {
  local sizes_file="$1" day="$2" offset=""
  if [ -r "${sizes_file}" ]; then
    offset=$(grep -F -- "${day}" "${sizes_file}" | head -n 1 | cut -d' ' -f2)
  fi
  case "${offset}" in
    '' | *[!0-9]*)
      echo "warning: no usable size for ${day} in ${sizes_file}; searching the whole accounting file" >&2
      offset=0
      ;;
  esac
  printf '%s\n' "${offset}"
}

#work out how much old data in accounting file to skip
YESTERDAY=$(date -u -d "yesterday 00:00 " '+%Y-%m-%d') #yesterdays date
if [ "${SKIP}" == "YES" ]; then
  FILESIZE=$(acct_skip_offset "${ACCTSIZES}" "${YESTERDAY}") #skip old jobs info
else
  FILESIZE=0 #search all jobs info
fi
#######################################
# Count the accounting records, past the skip offset, whose job-id column
# (field 6) is exactly one of the requested ids.
# Globals:   FILESIZE, ACCTFILE, MATCH (read)
# Outputs:   the number of matching records on stdout
#######################################
count_finished_jobs() {
  # -x: the whole column must match, so waiting for job 12 does not also
  # count jobs 123 or 1234.
  # MATCH is left unquoted on purpose: it is a list of "-e <jobid>" options.
  tail -c "+${FILESIZE:-0}" "${ACCTFILE}" | cut -d: -f6 | grep -x ${MATCH} | wc --lines
}

#wait until expected number of items are retrieved from accounting file
while true; do
  NITEMS=$(count_finished_jobs)
  # -ge rather than equality: if more records than expected show up between
  # two polls, the count would step over the target and the loop never end.
  if [ "${NITEMS}" -ge "${EXPITEMS:-0}" ]; then
    break
  fi
  sleep "${POLL}"
done
# Report heading: a dashed rule, the column titles, a second rule.
DIV='----------------------------------------------------------------------------------------------------------------'
printf '%s\n' "${DIV}"
printf "%-23s | % -10s | %-25s | %-9s | %-4s | %-4s | %-7s | %-7s\n" host user name jid fail exit secs maxvmem
printf '%s\n' "${DIV}"
# Row template for the job lines that follow; the last two columns
# (secs, maxvmem) are printed in scientific notation.
FORMAT="%-23s | % -10s | %-25s | %-9s | %-4s | %-4s | %-7.1e | %-7.1e\n"
#######################################
# Print the report row for one accounting record and, when the job's exit
# column is non-zero, the tail of its log file(s) found below the current
# directory.
# Globals:   MATCH, FORMAT, DIV, TAIL, OUTERR (read)
#            RETCODE (set to 1 when a failed job is reported)
# Arguments: $1 - one ':'-separated accounting record in which spaces have
#                 already been replaced by '_'
# Outputs:   report row, plus log excerpts for a failed job, on stdout
#######################################
report_job() {
  local record="$1"
  local columns exit_status job_name job_id errlog outlog found=0
  local -a cols

  # The caller's pre-filter matches anywhere in the record. Keep the record
  # only if the job-id column (field 6) is exactly one of the requested ids;
  # -x stops job 123 from being reported when waiting for job 12.
  # MATCH is unquoted on purpose: it is a list of "-e <jobid>" options.
  if ! printf '%s\n' "${record}" | cut -d: -f6 | grep -q -x ${MATCH}; then
    return 0
  fi

  # host, user, name, jid, fail, exit, secs, maxvmem
  columns=$(printf '%s\n' "${record}" | cut --output-delimiter=" " -d: -f2,4,5,6,12,13,14,43)
  read -r -a cols <<< "${columns}"
  # shellcheck disable=SC2059 -- FORMAT is the row template, not data
  printf "${FORMAT}" "${cols[@]}"

  exit_status=$(printf '%s\n' "${record}" | cut -d: -f13)
  # NOTE(review): with -T0 a failed job leaves RETCODE untouched, so the
  # script still exits 0. Kept as in the original; confirm this is intended.
  if [ "${exit_status}" == "0" ] || [ "${TAIL}" == "0" ]; then
    return 0
  fi
  RETCODE=1

  job_name=$(printf '%s\n' "${record}" | cut -d: -f5)
  job_id=$(printf '%s\n' "${record}" | cut -d: -f6)

  # find may return several logs with the same name; show each of them.
  while IFS= read -r errlog; do
    if [ "${found}" -eq 0 ]; then
      echo "${DIV}"
      found=1
    fi
    tail -n "${TAIL}" "${errlog}"
    if [ "${OUTERR}" == "YES" ]; then
      # Swap only the trailing ".err"; a directory name containing ".err"
      # must not be rewritten.
      outlog="${errlog%.err}.out"
      echo "${outlog}"
      if [ -r "${outlog}" ]; then
        tail -n "${TAIL}" "${outlog}"
      else
        # a missing stdout log should not abort the rest of the report
        echo "${outlog} not found"
      fi
    fi
  done < <(find . -name "${job_name}.${job_id}.err")

  if [ "${found}" -eq 0 ]; then
    echo
    echo "${job_name}.${job_id}.err not found"
  else
    echo "${DIV}"
  fi
}

RETCODE=0
# Spaces are turned into '_' so that a job name containing blanks stays a
# single column in the report row.
while IFS= read -r line; do
  report_job "${line}"
done < <(tail -c "+${FILESIZE:-0}" "${ACCTFILE}" | grep ${MATCH} | tr ' ' '_')
# Exit 1 if any reported job had a non-zero exit column while TAIL was not 0
# (that is the only place RETCODE is set to 1), otherwise exit 0.
exit ${RETCODE}