Skip to content

Commit

Permalink
Fix race with ibv_query_port() on the same ibDev
Browse files: browse the repository at this point in the history
  • Loading branch information
bureddy committed Mar 12, 2024
1 parent 40fb400 commit 64975a8
Show file tree
Hide file tree
Showing 2 changed files with 1 addition and 4 deletions.
4 changes: 0 additions & 4 deletions src/ib_plugin.c
Original file line number Diff line number Diff line change
Expand Up @@ -495,9 +495,6 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNet
for (int i = 0; i < comm->base.ndevs; i++) {
ncclIbSendCommDev* commDev = comm->devs + i;
ncclIbDev* ibDev = ncclIbDevs + commDev->base.ibDevN;
// Send my QP Info to receiver through the socket. Hope this won't block.
// TODO - I thought I queried this in init?
NCCLCHECK(wrap_ibv_query_port(ibDev->context, ibDev->portNum, &ibDev->portAttr));

// Write to the metadata struct via this pointer
ncclIbDevInfo* devInfo = meta.devs + i;
Expand Down Expand Up @@ -711,7 +708,6 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandl
ibDevN = mergedDev->devs[i];
NCCLCHECK(ncclIbInitCommDevBase(ibDevN, &rCommDev->base));
ibDev = ncclIbDevs + ibDevN;
NCCLCHECK(wrap_ibv_query_port(ibDev->context, ibDev->portNum, &ibDev->portAttr));
NCCLCHECK(wrap_ibv_query_gid(ibDev->context, ibDev->portNum, ncclParamIbGidIndex(), &rCommDev->base.gidInfo.localGid));
}

Expand Down
1 change: 1 addition & 0 deletions src/p2p_plugin.c
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,7 @@ ncclResult_t nccl_p2p_ib_init(int *num_devs, ncclIbDev *ncclIbDevs, char *ncclIb
pthread_mutex_init(&ncclIbDevs[ncclNIbDevs].lock, NULL);
ncclIbDevs[ncclNIbDevs].device = d;
ncclIbDevs[ncclNIbDevs].guid = devAttr.sys_image_guid;
ncclIbDevs[ncclNIbDevs].portAttr = portAttr;
ncclIbDevs[ncclNIbDevs].portNum = port_num;
ncclIbDevs[ncclNIbDevs].link = portAttr.link_layer;
ncclIbDevs[ncclNIbDevs].speed = nccl_p2p_ib_speed(portAttr.active_speed) * nccl_p2p_ib_width(portAttr.active_width);
Expand Down

0 comments on commit 64975a8

Please sign in to comment.