Skip to content

Commit

Permalink
Update with nccl-2.23.4-1
Browse files Browse the repository at this point in the history
  • Loading branch information
bureddy committed Sep 16, 2024
1 parent b246b19 commit 0b05da3
Show file tree
Hide file tree
Showing 7 changed files with 332 additions and 149 deletions.
62 changes: 39 additions & 23 deletions include/core.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,44 +50,60 @@

#include <errno.h>
// Check system calls
// SYSCHECK(statement, name)
//   Evaluates `statement` through SYSCHECKSYNC. If the final result is -1,
//   emits a WARN containing `name` and strerror(errno) and returns
//   ncclSystemError from the enclosing function.
//   `name` must be a string literal: it is pasted into the format string.
#define SYSCHECK(statement, name) do { \
  int retval; \
  SYSCHECKVAL((statement), name, retval); \
} while (false)

// SYSCHECKVAL(call, name, retval)
//   Same check as SYSCHECK, but the result of `call` is stored in the
//   caller-supplied lvalue `retval`, so it stays available afterwards.
#define SYSCHECKVAL(call, name, retval) do { \
  SYSCHECKSYNC((call), name, retval); \
  if (retval == -1) { \
    WARN("Call to " name " failed: %s", strerror(errno)); \
    return ncclSystemError; \
  } \
} while (false)

// SYSCHECKSYNC(statement, name, retval)
//   Evaluates `statement` and stores its result in the lvalue `retval`.
//   While the result is -1 and errno is EINTR, EWOULDBLOCK or EAGAIN, an INFO
//   line is logged and `statement` is evaluated again; any other outcome ends
//   the loop. `statement` may therefore run more than once.
//   The argument is parenthesized so that a comma or other low-precedence
//   expression is assigned as a whole.
#define SYSCHECKSYNC(statement, name, retval) do { \
  retval = (statement); \
  if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \
    INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \
  } else { \
    break; \
  } \
} while(true)

// SYSCHECKGOTO(statement, name, RES, label)
//   Evaluates `statement` through SYSCHECKSYNC. If the final result is -1,
//   emits a WARN containing `name` and strerror(errno), assigns
//   ncclSystemError to `RES` and jumps to `label`.
//   `name` must be a string literal: it is pasted into the format string.
//   No trailing semicolon after `while (0)`, so the macro can be used as a
//   single statement in an if/else.
#define SYSCHECKGOTO(statement, name, RES, label) do { \
  int retval; \
  SYSCHECKSYNC((statement), name, retval); \
  if (retval == -1) { \
    WARN("Call to " name " failed: %s", strerror(errno)); \
    RES = ncclSystemError; \
    goto label; \
  } \
} while (0)

// Pthread calls don't set errno and never return EINTR.
// PTHREADCHECK(statement, name)
//   Evaluates `statement` once and treats its value as an error number:
//   0 means success; any other value is passed to strerror() for a WARN
//   message and the enclosing function returns ncclSystemError.
//   `name` must be a string literal: it is pasted into the format string.
#define PTHREADCHECK(statement, name) do { \
  const int pthreadRc = (statement); \
  if (pthreadRc == 0) break; \
  WARN("Call to " name " failed: %s", strerror(pthreadRc)); \
  return ncclSystemError; \
} while (0)

// PTHREADCHECKGOTO(statement, name, RES, label)
//   Evaluates `statement` once and treats its value as an error number:
//   0 means success; any other value is passed to strerror() for a WARN
//   message, `RES` is set to ncclSystemError and control jumps to `label`.
//   `name` must be a string literal: it is pasted into the format string.
#define PTHREADCHECKGOTO(statement, name, RES, label) do { \
  const int pthreadRc = (statement); \
  if (pthreadRc == 0) break; \
  WARN("Call to " name " failed: %s", strerror(pthreadRc)); \
  RES = ncclSystemError; \
  goto label; \
} while (0)


// NEQCHECK(statement, value)
//   If `statement` evaluates to something other than `value`, logs
//   file, line, ncclSystemError and strerror(errno) via INFO and returns
//   ncclSystemError from the enclosing function.
//   `value` is parenthesized so that expressions such as `A | B` compare as a
//   whole, and there is no trailing semicolon after `while (0)` so the macro
//   works as a single statement in an if/else.
#define NEQCHECK(statement, value) do { \
  if ((statement) != (value)) { \
    /* Print the back trace*/ \
    INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, ncclSystemError, strerror(errno)); \
    return ncclSystemError; \
  } \
} while (0)

#define NEQCHECKGOTO(statement, value, RES, label) do { \
if ((statement) != value) { \
Expand All @@ -96,15 +112,15 @@
INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \
goto label; \
} \
} while (0);
} while (0)

// EQCHECK(statement, value)
//   If `statement` evaluates to `value`, logs file, line, ncclSystemError and
//   strerror(errno) via INFO and returns ncclSystemError from the enclosing
//   function.
//   `value` is parenthesized so that expressions such as `A | B` compare as a
//   whole, and there is no trailing semicolon after `while (0)` so the macro
//   works as a single statement in an if/else.
#define EQCHECK(statement, value) do { \
  if ((statement) == (value)) { \
    /* Print the back trace*/ \
    INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, ncclSystemError, strerror(errno)); \
    return ncclSystemError; \
  } \
} while (0)

#define EQCHECKGOTO(statement, value, RES, label) do { \
if ((statement) == value) { \
Expand All @@ -113,7 +129,7 @@
INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \
goto label; \
} \
} while (0);
} while (0)

// Propagate errors up
#define NCCLCHECK(call) do { \
Expand All @@ -122,15 +138,15 @@
/* Print the back trace*/ \
return RES; \
} \
} while (0);
} while (0)

// NCCLCHECKGOTO(call, RES, label)
//   Stores the result of `call` in `RES`. If it is neither ncclSuccess nor
//   ncclInProgress, jumps to `label`; otherwise execution continues.
//   `call` is parenthesized before assignment, and there is no trailing
//   semicolon after `while (0)` so the macro works as a single statement in
//   an if/else.
#define NCCLCHECKGOTO(call, RES, label) do { \
  RES = (call); \
  if (RES != ncclSuccess && RES != ncclInProgress) { \
    /* Nothing is logged here; control simply transfers to label */ \
    goto label; \
  } \
} while (0)

#define NCCLWAIT(call, cond, abortFlagPtr) do { \
volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \
Expand All @@ -139,7 +155,7 @@
return ncclInternalError; \
} \
if (tmpAbortFlag) NEQCHECK(*tmpAbortFlag, 0); \
} while (!(cond));
} while (!(cond))

#define NCCLWAITGOTO(call, cond, abortFlagPtr, RES, label) do { \
volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \
Expand All @@ -148,7 +164,7 @@
goto label; \
} \
if (tmpAbortFlag) NEQCHECKGOTO(*tmpAbortFlag, 0, RES, label); \
} while (!(cond));
} while (!(cond))

#define NCCLCHECKTHREAD(a, args) do { \
if (((args)->ret = (a)) != ncclSuccess && (args)->ret != ncclInProgress) { \
Expand Down
8 changes: 8 additions & 0 deletions include/p2p_plugin.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,13 @@ struct ncclIbMergedDev {
int devs[NCCL_IB_MAX_DEVS_PER_NIC]; // Points to an index in ncclIbDevs
int speed;
char devName[MAX_MERGED_DEV_NAME]; // Up to NCCL_IB_MAX_DEVS_PER_NIC * name size, and a character for each '+'
int dmaBufSupported; // 0 = uninit, 1 = yes, -1 = no
} __attribute__((aligned(64)));

// Statistics record; ncclIbDev embeds one as its `stats` member.
struct ncclIbStats {
// Presumably the number of fatal errors observed -- TODO confirm against the code that updates it.
int fatalErrorCount;
};

struct ncclIbRequest {
struct ncclIbNetCommBase* base;
int type;
Expand Down Expand Up @@ -108,6 +113,7 @@ typedef struct ncclIbDev {
struct ncclIbMrCache mrCache;
int ar; // ADAPTIVE_ROUTING
struct ibv_port_attr portAttr;
struct ncclIbStats stats;
} __attribute__((aligned(64))) ncclIbDev;


Expand Down Expand Up @@ -144,4 +150,6 @@ int ncclIbRelaxedOrderingCapable(void);

nccl_p2p_plugin_t nccl_p2p_get_plugin_type();

ncclResult_t ncclIbStatsInit(struct ncclIbStats* stat);

#endif
14 changes: 7 additions & 7 deletions include/timer.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,15 +33,15 @@ static double startTimes[8];
// TIME_START(index)
//   Increments counts[index] and records the current gettime() value in
//   startTimes[index].
//   No trailing semicolon after `while (0)`: the caller supplies it, which
//   keeps the macro usable as a single statement in an if/else.
#define TIME_START(index) do { \
  counts[index]++; \
  startTimes[index] = gettime(); \
} while (0)

// TIME_STOP(index)
//   Adds the time elapsed since startTimes[index] (per gettime()) to
//   times[index].
//   No trailing semicolon after `while (0)`: the caller supplies it, which
//   keeps the macro usable as a single statement in an if/else.
#define TIME_STOP(index) do { \
  times[index] += gettime() - startTimes[index]; \
} while (0)

// TIME_CANCEL(index)
//   Decrements counts[index], undoing the increment done by TIME_START.
//   No trailing semicolon after `while (0)`: the caller supplies it, which
//   keeps the macro usable as a single statement in an if/else.
#define TIME_CANCEL(index) do { \
  counts[index]--; \
} while (0)

#define TIME_PRINT(name) do { \
printf("%s stats", name); \
Expand All @@ -50,11 +50,11 @@ static double startTimes[8];
counts[i] = 0; \
} \
printf("\n"); \
} while (0);
} while (0)
#else
// No-op variants: each expands to an empty statement body. The caller
// supplies the terminating semicolon, so they stay valid in an if/else.
#define TIME_START(index) do {} while(0)
#define TIME_STOP(index) do {} while(0)
#define TIME_CANCEL(index) do {} while(0)
#define TIME_PRINT(name)
#endif
#endif
Loading

0 comments on commit 0b05da3

Please sign in to comment.