From 9a72fae7e1b13585020c931b9b91619b617bd31c Mon Sep 17 00:00:00 2001 From: Thomas Vegas Date: Thu, 19 Oct 2023 12:02:08 +0300 Subject: [PATCH] Enable all compile warnings and fix errors --- configure.ac | 4 ++-- include/p2p_plugin.h | 2 -- include/timer.h | 6 ++--- include/utils.h | 1 - src/ib_plugin.c | 24 ++------------------ src/p2p_plugin.c | 12 ++++------ src/sharp_plugin.c | 2 +- src/ucx_plugin.c | 27 ++++++++-------------- src/ucx_rma_plugin.c | 26 ++++++++++----------- src/utils.c | 54 -------------------------------------------- 10 files changed, 34 insertions(+), 124 deletions(-) diff --git a/configure.ac b/configure.ac index 95f4708b..13c4eeab 100644 --- a/configure.ac +++ b/configure.ac @@ -50,9 +50,9 @@ AC_ARG_ENABLE([debug],AS_HELP_STRING([--enable-debug], [Enable extra debugging c if test $enable_debug = yes; then AC_DEFINE([ENABLE_DEBUG], [1], [Enable debugging code]) - CFLAGS="$CFLAGS -O0 -g3 -Werror" + CFLAGS="$CFLAGS -O0 -g3 -Wall -Werror" else - CFLAGS="$CFLAGS -O3 -DNDEBUG -Werror" + CFLAGS="$CFLAGS -O3 -DNDEBUG -Wall -Werror" fi #check for cuda diff --git a/include/p2p_plugin.h b/include/p2p_plugin.h index f5b239a7..da03a9fc 100644 --- a/include/p2p_plugin.h +++ b/include/p2p_plugin.h @@ -9,8 +9,6 @@ #include #include -#define ENABLE_TIMER 0 -#include "timer.h" #include #include "nccl.h" diff --git a/include/timer.h b/include/timer.h index 41fd8f2f..7f4eb77b 100755 --- a/include/timer.h +++ b/include/timer.h @@ -52,9 +52,9 @@ static double startTimes[8]; printf("\n"); \ } while (0); #else -#define TIME_START(index) while(0); -#define TIME_STOP(index) while(0); -#define TIME_CANCEL(index) while(0); +#define TIME_START(index) do {} while(0); +#define TIME_STOP(index) do {} while(0); +#define TIME_CANCEL(index) do {} while(0); #define TIME_PRINT(name) #endif #endif diff --git a/include/utils.h b/include/utils.h index 11e27ef8..96ca62e1 100644 --- a/include/utils.h +++ b/include/utils.h @@ -26,7 +26,6 @@ struct netIf { int parseStringList(const char* string, struct netIf* ifList, int maxList); int matchIfList(const char* string, int port, struct netIf* ifList, int listSize, int matchExact); -int readFileNumber(long *value, const char *filename_fmt, ...); const char *get_plugin_lib_path(); #endif diff --git a/src/ib_plugin.c b/src/ib_plugin.c index 35e514e0..618e76f0 100644 --- a/src/ib_plugin.c +++ b/src/ib_plugin.c @@ -13,9 +13,9 @@ #include #include #include + #define ENABLE_TIMER 0 #include "timer.h" - #include "p2p_plugin.h" #include "core.h" #include "socket.h" @@ -46,25 +46,11 @@ NCCL_PARAM(IbTc, "IB_TC", 0); NCCL_PARAM(IbArThreshold, "IB_AR_THRESHOLD", 8192); NCCL_PARAM(IbPciRelaxedOrdering, "IB_PCI_RELAXED_ORDERING", 2); -pthread_t ncclIbAsyncThread; -static void* ncclIbAsyncThreadMain(void* args) { - struct ibv_context* context = (struct ibv_context*)args; - while (1) { - struct ibv_async_event event; - if (ncclSuccess != wrap_ibv_get_async_event(context, &event)) { break; } - char *str; - if (ncclSuccess != wrap_ibv_event_type_str(&str, event.event_type)) { break; } - if (event.event_type != IBV_EVENT_COMM_EST) - WARN("NET/IB : Got async event : %s", str); - if (ncclSuccess != wrap_ibv_ack_async_event(&event)) { break; } - } - return NULL; -} +static pthread_t ncclIbAsyncThread; // Determine whether RELAXED_ORDERING is enabled and possible int ncclIbRelaxedOrderingCapable(void) { int roMode = ncclParamIbPciRelaxedOrdering(); - ncclResult_t r = ncclInternalError; if (roMode == 1 || roMode == 2) { if (!IBV_ACCESS_RELAXED_ORDERING) { if(roMode == 1) @@ -110,11 +96,6 @@ ncclResult_t ncclIbGetProperties_v6(int dev, ncclNetProperties_v6_t* props_v6) return ncclSuccess; } -static ncclResult_t GetSocketAddr(union ncclSocketAddress* addr) { - memcpy(addr, &ncclIbIfAddr, sizeof(*addr)); - return ncclSuccess; -} - #define NCCL_IB_MAX_QPS 128 typedef struct ncclIbQpInfo { @@ -365,7 +346,6 @@ ncclResult_t ncclIbListen(int dev, void* opaqueHandle, void** listenComm) { ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { struct ncclIbHandle* handle = (struct ncclIbHandle*) opaqueHandle; - enum ncclSocketState conState; struct ncclIbCommStage* stage = &handle->stage; struct ncclIbSendComm* comm = (struct ncclIbSendComm*)stage->comm; struct ncclIbQpInfo remQpInfo; diff --git a/src/p2p_plugin.c b/src/p2p_plugin.c index eb0d5fa1..787688f6 100644 --- a/src/p2p_plugin.c +++ b/src/p2p_plugin.c @@ -65,7 +65,6 @@ static nccl_p2p_plugin_t p2p_plugin = NCCL_P2P_LAST; static void pluginSetup() { - p2p_plugin = NCCL_P2P_IB; const char *plugin_path = get_plugin_lib_path(); if (plugin_path != NULL) { @@ -84,11 +83,6 @@ static void pluginSetup() } } switch (p2p_plugin) { - case NCCL_P2P_IB: - ncclNetPlugin_v7 = ibPlugin_v7; - ncclNetPlugin_v6 = ibPlugin_v6; - ncclNetPlugin_v5 = ibPlugin_v5; - break; #ifdef HAVE_UCX_PLUGIN case NCCL_P2P_UCX: ncclNetPlugin_v7 = ucxPlugin_v7; @@ -101,6 +95,11 @@ static void pluginSetup() ncclNetPlugin_v5 = ucxRmaPlugin_v5; break; #endif + default: + ncclNetPlugin_v7 = ibPlugin_v7; + ncclNetPlugin_v6 = ibPlugin_v6; + ncclNetPlugin_v5 = ibPlugin_v5; + break; } } @@ -413,4 +412,3 @@ nccl_p2p_plugin_t nccl_p2p_get_plugin_type() struct ncclIbDev ncclIbDevs[MAX_IB_DEVS]; struct ncclIbDev userIbDevs[MAX_IB_DEVS]; - diff --git a/src/sharp_plugin.c b/src/sharp_plugin.c index 07380c5d..25ae9055 100644 --- a/src/sharp_plugin.c +++ b/src/sharp_plugin.c @@ -138,7 +138,7 @@ int ncclSharpAllGather(void *context, void *buf, int len) { if (rrequest == NULL) NCCLCHECK(ncclNetPlugin_v7.irecv(cComm->recvComm, 1, &rbuf, &len, &tag, &rMhandle, &rrequest)); } while (srequest || rrequest) { - int done; + int done = 0; /* silent uninitialized false positive */ if (rrequest) NCCLCHECK(ncclNetPlugin_v7.test(rrequest, &done, NULL)); if (done) rrequest = NULL; if (srequest) NCCLCHECK(ncclNetPlugin_v7.test(srequest, &done, NULL)); diff --git a/src/ucx_plugin.c b/src/ucx_plugin.c index 70d4fcc8..51887578 100644 --- a/src/ucx_plugin.c +++ b/src/ucx_plugin.c @@ -210,11 +210,6 @@ static void recv_handler_nbx(void *request, ucs_status_t status, static union ncclSocketAddress nccl_ucx_if_addr; static char if_name[MAX_IF_NAME_SIZE]; -static ncclResult_t GetSocketAddr(union ncclSocketAddress *addr) { - memcpy(addr, &nccl_ucx_if_addr, sizeof(*addr)); - return ncclSuccess; -} - static ncclResult_t ucx_init_context(ucp_context_h *ctx, int dev) { ucp_params_t ucp_params; ucp_config_t *config; @@ -306,7 +301,7 @@ static ncclResult_t ucx_get_ctx_and_worker(int dev, ucp_context_h *ctx, } static ncclResult_t nccl_ucx_free_worker(ucp_worker_h worker) { - int i, dummy, count, done = 0; + int i, dummy, count = 0 /* fix warning */, done = 0; struct ep_list *ep, *cur; struct nccl_ucx_worker *ucx_worker, *next; ncclResult_t result; @@ -419,14 +414,14 @@ static void ucx_request_init(ucx_comm_t *comm) { } ncclResult_t nccl_ucx_connect(int dev, void *handle, void **send_comm, ncclNetDeviceHandle_t** sendDevComm) { - ucx_listen_handle_t *recv_handle = (ucx_listen_handle_t*)handle; - struct ncclUCXCommStage* stage = &recv_handle->stage; - ucx_comm_t *comm = stage->comm; - ucp_address_t *my_addr; - size_t local_addr_len; - enum ncclSocketState conState; + ucx_listen_handle_t *recv_handle = (ucx_listen_handle_t*)handle; + struct ncclUCXCommStage *stage = &recv_handle->stage; + ucx_comm_t *comm = stage->comm; + ucp_address_t *my_addr; + size_t local_addr_len; + int ready; + *send_comm = NULL; - int ready; if (stage->state == ncclUCXCommStateConnect) goto ucx_connect_check; @@ -466,13 +461,11 @@ ncclResult_t nccl_ucx_connect_v6(int dev, void *handle, void **send_comm) { ncclResult_t nccl_ucx_accept(void *listen_comm, void **recv_comm, ncclNetDeviceHandle_v7_t** recvDevComm) { ucx_listen_comm_t *l_comm = (ucx_listen_comm_t *)listen_comm; - socklen_t socklen = sizeof(struct sockaddr_in); struct ncclUCXCommStage* stage = &l_comm->stage; ucx_comm_t *r_comm = (ucx_comm_t *)stage->comm; size_t peer_addr_len; ucp_address_t *peer_addr; ucp_ep_params_t ep_params; - struct sockaddr_in sockaddr; int ready; *recv_comm = NULL; @@ -584,10 +577,8 @@ ncclResult_t nccl_ucx_regmr_dmabuf(void* comm, void* data, size_t size, int type } static ucx_request_t *ucx_request_get(ucx_comm_t *comm) { - static const size_t entries = sizeof(comm->reqs) / sizeof(*comm->reqs); - ucx_request_t *req; + ucx_request_t *req = comm->free_req; - req = comm->free_req; if (req == NULL) { WARN("NET/UCX: unable to allocate NCCL request"); return NULL; diff --git a/src/ucx_rma_plugin.c b/src/ucx_rma_plugin.c index b63e1232..b4811924 100644 --- a/src/ucx_rma_plugin.c +++ b/src/ucx_rma_plugin.c @@ -212,11 +212,6 @@ typedef struct nccl_ucx_rma_recv_comm { static union ncclSocketAddress nccl_ucx_if_addr; static char if_name[MAX_IF_NAME_SIZE]; -static ncclResult_t GetSocketAddr(union ncclSocketAddress *addr) { - memcpy(addr, &nccl_ucx_if_addr, sizeof(*addr)); - return ncclSuccess; -} - typedef struct nccl_ucx_am_request { nccl_ucx_rma_request_t *req; } nccl_ucx_am_request_t; @@ -425,13 +420,19 @@ static ucs_status_t nccl_ucx_rma_am_rkey_cb(void *arg, void *data, size_t length { nccl_ucx_rma_send_comm_t *comm = (nccl_ucx_rma_send_comm_t*)arg; nccl_ucx_rma_rkey_buf_t *rkey_buf = (nccl_ucx_rma_rkey_buf_t*)data; + ucs_status_t status; if (comm->rkeys[rkey_buf->index].rkey) { ucp_rkey_destroy(comm->rkeys[rkey_buf->index].rkey); } comm->rkeys[rkey_buf->index].id = rkey_buf->id; - UCXCHECK(ucp_ep_rkey_unpack(comm->ep, rkey_buf->buf, - &comm->rkeys[rkey_buf->index].rkey)); + status = ucp_ep_rkey_unpack(comm->ep, rkey_buf->buf, + &comm->rkeys[rkey_buf->index].rkey); + if (status != UCS_OK) { + WARN("Failed: UCX am rkey cb: rkey unpack error %s", + ucs_status_string(status)); + } + return UCS_OK; } @@ -439,8 +440,8 @@ static ucs_status_t nccl_ucx_rma_am_rkey_cb(void *arg, void *data, size_t length ncclResult_t nccl_ucx_rma_connect(int dev, void *handle, void **send_comm, ncclNetDeviceHandle_t** sendDevComm) { ucx_rma_listen_handle_t *recv_handle = (ucx_rma_listen_handle_t*)handle; - struct ncclUCXCommStage* stage = &recv_handle->stage; - nccl_ucx_rma_send_comm_t *comm = stage->comm; + struct ncclUCXCommStage* stage = &recv_handle->stage; + nccl_ucx_rma_send_comm_t *comm = stage->comm; ucp_mem_map_params_t mmap_params; size_t rkey_buf_size; void *rkey_buf; @@ -549,10 +550,8 @@ static ncclResult_t nccl_ucx_rma_init_ep(struct ncclSocket *sock, ucp_worker_h w ncclResult_t nccl_ucx_rma_accept(void *listen_comm, void **recv_comm, ncclNetDeviceHandle_v7_t** recvDevComm) { nccl_ucx_rma_listen_comm_t *l_comm = (nccl_ucx_rma_listen_comm_t *)listen_comm; - socklen_t socklen = sizeof(struct sockaddr_in); - struct ncclUCXCommStage* stage = &l_comm->stage; - nccl_ucx_rma_recv_comm_t *r_comm; - struct sockaddr_in sockaddr; + struct ncclUCXCommStage* stage = &l_comm->stage; + nccl_ucx_rma_recv_comm_t *r_comm = stage->comm; void *rkey_buf; size_t rkey_buf_size; int ready; @@ -1126,7 +1125,6 @@ ncclResult_t nccl_ucx_rma_close_recv(void *recv_comm) { nccl_ucx_rma_recv_comm_t *comm = (nccl_ucx_rma_recv_comm_t*)recv_comm; void *close_req; - int debug = 1; int close = 1; if (recv_comm) { diff --git a/src/utils.c b/src/utils.c index 341f09b1..d36beb50 100644 --- a/src/utils.c +++ b/src/utils.c @@ -110,60 +110,6 @@ int matchIfList(const char* string, int port, struct netIf* ifList, int listSize return 0; } -static size_t readFileVarArg(char *buffer, size_t max, - const char *filename_fmt, va_list ap) -{ - char filename[PATH_MAX]; - ssize_t read_bytes; - int fd; - - vsnprintf(filename, PATH_MAX, filename_fmt, ap); - - fd = open(filename, O_RDONLY); - if (fd < 0) { - return -1; - } - - read_bytes = read(fd, buffer, max - 1); - if (read_bytes < 0) { - return -1; - } - - if (read_bytes < max) { - buffer[read_bytes] = '\0'; - } - -out_close: - close(fd); -} - -int readFileNumber(long *value, const char *filename_fmt, ...) -{ - char buffer[64], *tail; - ssize_t read_bytes; - va_list ap; - long n; - - va_start(ap, filename_fmt); - read_bytes = readFileVarArg(buffer, sizeof(buffer) - 1, - filename_fmt, ap); - va_end(ap); - - if (read_bytes < 0) { - /* read error */ - return -1; - } - - n = strtol(buffer, &tail, 0); - if ((*tail != '\0') && !isspace(*tail)) { - /* parse error */ - return -1; - } - - *value = n; - return 0; -} - const char *get_plugin_lib_path() { Dl_info dl_info;