allreduce_1step.py (forked from Azure/msccl-tools)
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import argparse

from msccl.language import *
from msccl.topologies import *
from msccl.language.collectives import AllReduce

def allreduce_allpairs(gpus, instances, protocol):
    size = gpus
    chunksperloop = gpus
    topology = fully_connected(size)
    collective = AllReduce(size, chunksperloop, True)
    with MSCCLProgram("allreduce_pairs", topology, collective, instances, protocol=protocol,
                      interleaved_replication=False, threadblock_policy=ThreadblockPolicy.manual,
                      dependence_nop=True):
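
        # Algorithm overview: each rank's input is split into `size` chunks (chunksperloop
        # equals the number of GPUs). In a single communication step, every rank broadcasts
        # its whole input into the other ranks' scratch buffers and then reduces those
        # scratch chunks into its own input buffer, so each rank ends up with the fully
        # reduced result.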

        # Phase 1: each rank copies its entire input (all `size` chunks starting at index 0)
        # into every other rank's scratch space.
        for rank in range(size):
            tb = 0
            for nghr in range(size):
                if rank != nghr:
                    c = chunk(rank, Buffer.input, index=0, size=size)
                    c.copy(nghr, 'scratch', sendtb=nghr, recvtb=rank)
                    tb += 1
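
        # Example trace (gpus=2): rank 0 copies input chunks [0, 1] into rank 1's scratch
        # via send threadblock 1 on rank 0 and receive threadblock 0 on rank 1, and rank 1
        # performs the mirror-image copy into rank 0's scratch.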

        # Phase 2: each rank locally reduces the received scratch chunks into its own
        # input chunks, using one threadblock per chunk (`size` threadblocks in total)
        # for better parallelism.
        for rank in range(size):
            index = 0
            tb = 0
            for nghr in range(size):
                if rank != nghr:
                    for s in range(size):
                        c = chunk(rank, Buffer.input, s)
                        c.reduce(chunk(rank, 'scratch', index), sendtb=s)
                        index += 1
                    tb += 1
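
        # Example trace (gpus=2): on rank 0, input chunk 0 accumulates scratch chunk 0 on
        # threadblock 0 and input chunk 1 accumulates scratch chunk 1 on threadblock 1, so
        # every input chunk becomes the sum of that chunk across all ranks.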

        XML()
        Check()

parser = argparse.ArgumentParser()
parser.add_argument('num_gpus', type=int, help='number of gpus')
parser.add_argument('instances', type=int, help='number of instances')
parser.add_argument('--protocol', type=str, default='LL', choices=['Simple', 'LL128', 'LL'], help='Protocol')
args = parser.parse_args()

allreduce_allpairs(args.num_gpus, args.instances, args.protocol)
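
# Usage sketch, assuming XML() writes the generated MSCCL program XML to stdout as in
# msccl-tools:
#   python allreduce_1step.py 8 1 --protocol LL > allreduce_1step.xml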