-
Notifications
You must be signed in to change notification settings - Fork 98
/
Copy pathslurmibtopology.sh
executable file
·192 lines (169 loc) · 5.51 KB
/
slurmibtopology.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
#!/usr/bin/env bash
# Discover Infiniband network and print a Slurm topology.conf file
# Author: [email protected]
# Homepage: https://github.com/OleHolmNielsen/Slurm_tools/
# Version banner (printed by the -V option).
VERSION='slurmibtopology.sh version 0.22. Date: 09-May-2017'

# SITE CONFIGURATION: adjust the locations below to match this system.
# Installation directories of the external commands:
prefix='/usr/bin'
sprefix='/usr/sbin'

# InfiniBand tools shipped in the infiniband-diags RPM:
IBSTAT="${sprefix}/ibstat"
IBNETDISCOVER="${sprefix}/ibnetdiscover"

# Slurm command used to print sorted hostlists. It is exported because the
# awk program further down reads it from its environment.
MY_SCONTROL="${prefix}/scontrol"
export MY_SCONTROL

# GNU Awk (gawk version 4 is preferred, gawk version 3 should work as well)
MY_AWK="${prefix}/gawk"
# Print the command-line synopsis and the meaning of each option on stdout.
usage() {
  printf '%s\n' \
    'Usage: slurmibtopology.sh [-c]' \
    'where:' \
    '-c: comments in the output will be filtered' \
    '-V: Version information' \
    '-h: Print this help information'
}
#######################################
# Parse the command-line options and reject extraneous arguments.
# Globals:
#   MY_FILTER (written): command the generated output is piped through.
#     Defaults to "cat" (pass everything through); -c selects a grep that
#     removes the comment lines and exports the variable.
#   VERSION (read): text printed for -V.
# Arguments:
#   The command-line arguments of the script ("$@").
# Outputs:
#   Version or usage text on stdout.
# Exits:
#   0 after -V or -h (the request was served successfully),
#   1 on an unknown option or when non-option arguments are present.
#######################################
function parse_options()
{
  # OPTIND is local so that the function always starts at the first argument.
  local opt OPTIND=1

  MY_FILTER="cat"
  while getopts "cVh" opt; do
    case "$opt" in
      c)
        export MY_FILTER="grep -v ^#"
        ;;
      V)
        echo "$VERSION"
        exit 0
        ;;
      h)
        usage
        exit 0
        ;;
      *)
        # getopts reports an unknown option as "?" after printing its own
        # diagnostic; this is the only real error case.
        usage
        exit 1
        ;;
    esac
  done

  # Anything left after the options is an error: no operands are accepted.
  if (( $# > OPTIND - 1 )); then
    echo "ERROR: Too many command line arguments: $*"
    usage
    exit 1
  fi
}
parse_options "$@"
# Both infiniband-diags commands are required. Check them one at a time so
# that the message names the command that is actually missing.
for ib_command in "$IBNETDISCOVER" "$IBSTAT"; do
  if [[ ! -x "$ib_command" ]]; then
    echo "Error: Command $ib_command not found"
    echo "Please install the RPM package infiniband-diags"
    # Exit statuses are limited to 0-255; 255 is what bash reports for -1.
    exit 255
  fi
done
# scontrol is optional. The expansion is quoted so that an empty or unset
# value, or a path containing blanks, is handled as "not executable" instead
# of producing a malformed test expression.
if [[ ! -x "$MY_SCONTROL" ]]; then
  echo "Notice: Command $MY_SCONTROL not found (for sorting hostlists)"
  # An empty value makes the awk program print the host lists uncollapsed.
  export MY_SCONTROL=""
fi
# Verify the local InfiniBand interface: the check passes when "ibstat -l"
# exits with status 0. Its own output is shown to the user.
echo Verify the Infiniband interface:
if ! "$IBSTAT" -l; then
  echo Infiniband interface NOT OK
  # Exit statuses are limited to 0-255; 255 is what bash reports for -1.
  exit 255
fi
echo Infiniband interface OK
# Announce what follows and remind the user to review the generated file.
printf '%s\n' \
  'Generate the Slurm topology.conf file for Infiniband switches.' \
  'Beware: The Switches= lines need to be reviewed and edited for correctness.' \
  'Read also https://slurm.schedmd.com/topology.html'
#######################################
# Turn an "ibnetdiscover -S -p" port listing into Slurm topology.conf text.
# Globals:
#   MY_AWK (read): awk interpreter used to run the program.
#   MY_SCONTROL (read by awk from the environment, so it must be exported):
#     scontrol command used to sort and compress host lists; when empty the
#     lists are printed exactly as collected.
# Inputs:
#   The port listing on stdin. Only lines whose first field is "SW" are used.
# Outputs:
#   SwitchName= lines plus explanatory "#" comment lines on stdout.
# Note:
#   The awk program is enclosed in shell single quotes, so it must not
#   contain a single-quote character; octal \047 is used instead.
#######################################
function generate_topology()
{
  "$MY_AWK" '
BEGIN {
  # Path of the scontrol command, handed over through the environment
  scontrol = ENVIRON["MY_SCONTROL"]
}

# Return LIST as a sorted, compressed hostlist expression.
# LIST is returned unchanged when no scontrol command is configured, when
# LIST is empty, or when the scontrol command produces no output.
# cmd and sortedlist are local variables (extra awk parameters).
function collapse_list(list,    cmd, sortedlist)
{
  if (scontrol == "" || list == "") {
    return list
  }
  # LIST is built from text found between single quotes in the input and
  # from generated switch names, so it holds no single quote itself and can
  # be passed to the shell safely inside a pair of them (octal \047).
  cmd = scontrol " show hostlistsorted \047" list "\047"
  if ((cmd | getline sortedlist) <= 0 || sortedlist == "") {
    sortedlist = list    # Command failed or printed nothing: keep the input
  }
  close(cmd)
  return sortedlist
}

# One input line per switch port
$1 == "SW" {
  guid = $4          # Switch GUID
  linkwidth = $5
  # Split the line at single quotes (octal \047) to get the descriptions
  split($0, comment, "\047")
  swdesc = comment[2]             # First quoted text: switch node description
  nodedescription = comment[4]    # Second quoted text: neighbor node description

  # A switch is considered new when its GUID differs from the previous line.
  # NOTE(review): this assumes all port lines of a switch are contiguous.
  if (nswitch == 0 || switchguid[nswitch] != guid) {
    nswitch++
    switchguid[nswitch] = guid             # Switches are identified by GUID
    switchnum[guid] = nswitch              # Reverse lookup: GUID to index
    switchdescription[nswitch] = swdesc
  }

  if (linkwidth == "??") next     # Skip inactive link

  neighbortype = $8
  neighborguid = $11
  if (neighbortype == "CA") {
    # Host link (HCA)
    linkcount[nswitch,neighbortype]++
    split(nodedescription, desc, " ")
    hostname = desc[1]    # First word of the description should be the hostname
    neighborlist[nswitch,hostname]++
    if (neighborlist[nswitch,hostname] == 1) {
      # List every host once, even when it has several links to the switch
      if (hostlist[nswitch] == "") {
        hostlist[nswitch] = hostname
      } else {
        hostlist[nswitch] = hostlist[nswitch] "," hostname
      }
    }
  } else if (neighbortype == "SW") {
    # Switch-to-switch link
    linkcount[nswitch,neighborguid]++
    switchneighbor[nswitch,neighborguid] = neighborguid
  }
}

END {
  # Generated switch names: a fixed prefix followed by the switch index
  switchprefix = "ibsw"
  for (i = 1; i <= nswitch; i++) {
    switchname[i] = switchprefix sprintf("%d", i)
  }

  for (i = 1; i <= nswitch; i++) {
    printf("#\n# IB switch no. %d: %s GUID: %s Description: %s\n#\n", i, switchname[i], switchguid[i], switchdescription[i])

    # Link summary. Keys of linkcount are "index SUBSEP neighbor"; they are
    # split by hand so that gawk version 3 (no arrays of arrays) works too.
    totallinks = 0
    for (ind in linkcount) {
      split(ind, t, SUBSEP)
      if ((t[1] + 0) != i) continue
      l = t[2]
      totallinks += linkcount[i,l]
      print "# Switch neighbor ", l, " with " linkcount[i,l] " links"
    }
    print "# Total number of links in this switch = ", totallinks

    # A topology.conf SwitchName line contains either Nodes= or Switches=
    # See https://slurm.schedmd.com/topology.conf.html
    if (hostlist[i] == "") {
      # A top-level switch with no leaf compute nodes
      print "# NOTICE: This switch " switchname[i] " has no attached nodes (empty hostlist)"
      # Gather the names of all switches linked to this switch
      switchlist = ""
      SWSEP = ""
      for (ind in switchneighbor) {
        split(ind, t, SUBSEP)
        if ((t[1] + 0) != i) continue
        neighborguid = switchneighbor[i,t[2]]
        switchlist = switchlist SWSEP switchname[switchnum[neighborguid]]
        SWSEP = ","
      }
      printf("SwitchName=%s Switches=%s\n", switchname[i], collapse_list(switchlist))
    } else {
      printf("SwitchName=%s Nodes=%s\n", switchname[i], collapse_list(hostlist[i]))
      # Remember the leaf switches for the optional spine line below
      if (allswitches == "") {
        allswitches = switchname[i]
      } else {
        allswitches = allswitches "," switchname[i]
      }
    }
  }
  # Optional: merge all leaf switches below one artificial spine switch
  # printf("#\n# Merging all switches in a top-level spine switch\n#\n")
  # print "SwitchName=spineswitch Switches=" collapse_list(allswitches)
}'
}

# Discover IB switches (-S) and ports (-p), convert the listing and apply the
# output filter selected on the command line.
# MY_FILTER holds a command together with its arguments and is therefore
# expanded unquoted on purpose.
# shellcheck disable=SC2086
"$IBNETDISCOVER" -S -p | generate_topology | $MY_FILTER