From 31138e9a5ad81e288229dfc70f24af8e8370a1a1 Mon Sep 17 00:00:00 2001 From: Devendar Bureddy Date: Tue, 12 Mar 2024 01:16:15 +0200 Subject: [PATCH] fix race with ibv_query_port() on same ibDev --- src/ib_plugin.c | 4 ---- src/p2p_plugin.c | 1 + 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/src/ib_plugin.c b/src/ib_plugin.c index a1e663e2..9a304f17 100644 --- a/src/ib_plugin.c +++ b/src/ib_plugin.c @@ -495,9 +495,6 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNet for (int i = 0; i < comm->base.ndevs; i++) { ncclIbSendCommDev* commDev = comm->devs + i; ncclIbDev* ibDev = ncclIbDevs + commDev->base.ibDevN; - // Send my QP Info to receiver through the socket. Hope this won't block. - // TODO - I thought I queried this in init? - NCCLCHECK(wrap_ibv_query_port(ibDev->context, ibDev->portNum, &ibDev->portAttr)); // Write to the metadata struct via this pointer ncclIbDevInfo* devInfo = meta.devs + i; @@ -711,7 +708,6 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandl ibDevN = mergedDev->devs[i]; NCCLCHECK(ncclIbInitCommDevBase(ibDevN, &rCommDev->base)); ibDev = ncclIbDevs + ibDevN; - NCCLCHECK(wrap_ibv_query_port(ibDev->context, ibDev->portNum, &ibDev->portAttr)); NCCLCHECK(wrap_ibv_query_gid(ibDev->context, ibDev->portNum, ncclParamIbGidIndex(), &rCommDev->base.gidInfo.localGid)); } diff --git a/src/p2p_plugin.c b/src/p2p_plugin.c index afd05983..1663d85e 100644 --- a/src/p2p_plugin.c +++ b/src/p2p_plugin.c @@ -334,6 +334,7 @@ ncclResult_t nccl_p2p_ib_init(int *num_devs, ncclIbDev *ncclIbDevs, char *ncclIb pthread_mutex_init(&ncclIbDevs[ncclNIbDevs].lock, NULL); ncclIbDevs[ncclNIbDevs].device = d; ncclIbDevs[ncclNIbDevs].guid = devAttr.sys_image_guid; + ncclIbDevs[ncclNIbDevs].portAttr = portAttr; ncclIbDevs[ncclNIbDevs].portNum = port_num; ncclIbDevs[ncclNIbDevs].link = portAttr.link_layer; ncclIbDevs[ncclNIbDevs].speed = nccl_p2p_ib_speed(portAttr.active_speed) * nccl_p2p_ib_width(portAttr.active_width);