Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reduce-scatter and Allgather APIs #123

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions include/net.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCC

typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);

#include "net_v8.h"
#include "net_v7.h"
#include "net_v6.h"
#include "net_v5.h"
Expand Down
75 changes: 2 additions & 73 deletions include/net_v7.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,79 +5,8 @@
#ifndef NCCL_NET_V7_H_
#define NCCL_NET_V7_H_

#include "net_device.h"

typedef struct {
char* name; // Used mostly for logging.
char* pciPath; // Path to the PCI device in /sys.
uint64_t guid; // Unique identifier for the NIC chip. Important for
// cards with multiple PCI functions (Physical or virtual).
int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
int speed; // Port speed in Mbps.
int port; // Port number.
float latency; // Network latency
int maxComms; // Maximum number of comms we can create
int maxRecvs; // Maximum number of grouped receives.
ncclNetDeviceType netDeviceType; // Network offload type
int netDeviceVersion; // Version number for network offload
} ncclNetProperties_v7_t;

typedef ncclNetProperties_v7_t ncclNetProperties_t;

typedef struct {
// Name of the network (mainly for logs)
const char* name;
// Initialize the network.
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
// Return the number of adapters.
ncclResult_t (*devices)(int* ndev);
// Get various device properties.
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create a connection.
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
// Connect to a handle and return a sending comm object for that peer.
// This call must not block for the connection to be established, and instead
// should return successfully with sendComm == NULL with the expectation that
// it will be called again until sendComm != NULL.
// If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm);
// Finalize connection establishment after remote peer has called connect.
// This call must not block for the connection to be established, and instead
// should return successfully with recvComm == NULL with the expectation that
// it will be called again until recvComm != NULL.
// If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm);
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
/* DMA-BUF support */
ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
ncclResult_t (*deregMr)(void* comm, void* mhandle);
// Asynchronous send to a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
// Asynchronous recv from a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
// Test whether a request is complete. If size is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* sizes);
// Close and free send/recv comm objects
ncclResult_t (*closeSend)(void* sendComm);
ncclResult_t (*closeRecv)(void* recvComm);
ncclResult_t (*closeListen)(void* listenComm);

// Copy the given mhandle to a dptr in a format usable by this plugin's device code
ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);

// Notify the plugin that a recv has completed by the device
ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
} ncclNet_v7_t;
typedef ncclNetProperties_v8_t ncclNetProperties_v7_t;
typedef ncclNet_v8_t ncclNet_v7_t;

// v7 struct for backwards compatibility
typedef struct {
Expand Down
134 changes: 134 additions & 0 deletions include/net_v8.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
/*
* Copyright (c) 2017-2023, NVIDIA CORPORATION. All rights reserved.
*/

#ifndef NCCL_NET_V8_H_
#define NCCL_NET_V8_H_
#include "net_device.h"

typedef struct {
char* name; // Used mostly for logging.
char* pciPath; // Path to the PCI device in /sys.
uint64_t guid; // Unique identifier for the NIC chip. Important for
// cards with multiple PCI functions (Physical or virtual).
int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
int speed; // Port speed in Mbps.
int port; // Port number.
float latency; // Network latency
int maxComms; // Maximum number of comms we can create
int maxRecvs; // Maximum number of grouped receives.
ncclNetDeviceType netDeviceType; // Network offload type
int netDeviceVersion; // Version number for network offload
} ncclNetProperties_v8_t;

typedef ncclNetProperties_v8_t ncclNetProperties_t;

typedef struct {
// Name of the network (mainly for logs)
const char* name;
// Initialize the network.
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
// Return the number of adapters.
ncclResult_t (*devices)(int* ndev);
// Get various device properties.
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create a connection.
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
// Connect to a handle and return a sending comm object for that peer.
// This call must not block for the connection to be established, and instead
// should return successfully with sendComm == NULL with the expectation that
// it will be called again until sendComm != NULL.
// If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm);
// Finalize connection establishment after remote peer has called connect.
// This call must not block for the connection to be established, and instead
// should return successfully with recvComm == NULL with the expectation that
// it will be called again until recvComm != NULL.
// If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm);
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
/* DMA-BUF support */
ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
ncclResult_t (*deregMr)(void* comm, void* mhandle);
// Asynchronous send to a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
// Asynchronous recv from a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
// Test whether a request is complete. If size is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* sizes);
// Close and free send/recv comm objects
ncclResult_t (*closeSend)(void* sendComm);
ncclResult_t (*closeRecv)(void* recvComm);
ncclResult_t (*closeListen)(void* listenComm);

// Copy the given mhandle to a dptr in a format usable by this plugin's device code
ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);

// Notify the plugin that a recv has completed by the device
ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
} ncclNet_v8_t;

typedef struct {
void* mhandle;
void* address;
uint32_t size;
} ncclNetSGE_v8_t;

typedef struct {
// Name of the collective network (mainly for logs)
const char* name;
// Initialize the collective network.
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
// Return the number of adapters capable of doing collective operations.
// If ndev returns 0, all other functions might be set to NULL.
ncclResult_t (*devices)(int* ndev);
// Get various device properties.
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create connections.
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
// Create a group for collective operations. handles have been created
// using listen() above. rank indicates caller's rank in the collective network.
ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
// Returns whether a reduction operation on a data type is supported.
// 1 for supported, 0 otherwise.
ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
// Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
/* DMA-BUF support */
ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
ncclResult_t (*deregMr)(void* collComm, void* mhandle);
// Performs an asynchronous allreduce operation on the collective group.
// May return request == NULL if the call cannot be performed (or would block).
ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v8_t* recvParts,
size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
void* sendMhandle, void** request);
ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v8_t* sendParts, void* recvData,
size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
ncclDataType_t dataType, ncclRedOp_t redOp,
void* recvMhandle, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
// Test whether a request is complete. If size is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* size);
// Close and free collective comm objects
ncclResult_t (*closeColl)(void* collComm);
ncclResult_t (*closeListen)(void* listenComm);
} ncclCollNet_v8_t;

#endif // end include guard
22 changes: 22 additions & 0 deletions src/ib_plugin.c
Original file line number Diff line number Diff line change
Expand Up @@ -1124,6 +1124,28 @@ ncclResult_t ncclIbCloseListen(void* listenComm) {
return ncclSuccess;
}

const ncclNet_v8_t ibPlugin_v8 = {
.name = "IBext_v8",
.init = ncclIbInit,
.devices = ncclIbDevices,
.getProperties = ncclIbGetProperties,
.listen = ncclIbListen,
.connect = ncclIbConnect,
.accept = ncclIbAccept,
.regMr = ncclIbRegMr,
.regMrDmaBuf = ncclIbRegMrDmaBuf,
.deregMr = ncclIbDeregMr,
.isend = ncclIbIsend,
.irecv = ncclIbIrecv,
.iflush = ncclIbIflush,
.test = ncclIbTest,
.closeSend = ncclIbCloseSend,
.closeRecv = ncclIbCloseRecv,
.closeListen = ncclIbCloseListen,
NULL /* getDeviceMr */,
NULL /* irecvConsumed */
};

const ncclNet_v7_t ibPlugin_v7 = {
.name = "IBext_v7",
.init = ncclIbInit,
Expand Down
19 changes: 19 additions & 0 deletions src/p2p_plugin.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,17 @@
#include "p2p_plugin.h"

#ifdef HAVE_UCX_PLUGIN
extern ncclNet_v8_t ucxPlugin_v8;
extern ncclNet_v7_t ucxPlugin_v7;
extern ncclNet_v6_t ucxPlugin_v6;
extern ncclNet_v5_t ucxPlugin_v5;
extern ncclNet_v8_t ucxRmaPlugin_v8;
extern ncclNet_v7_t ucxRmaPlugin_v7;
extern ncclNet_v6_t ucxRmaPlugin_v6;
extern ncclNet_v5_t ucxRmaPlugin_v5;
#endif

extern ncclNet_v8_t ibPlugin_v8;
extern ncclNet_v7_t ibPlugin_v7;
extern ncclNet_v6_t ibPlugin_v6;
extern ncclNet_v5_t ibPlugin_v5;
Expand All @@ -40,10 +43,16 @@ extern int ncclIbRelaxedOrderingEnabled;
NCCL_PARAM(SharpMaxComms, "SHARP_MAX_COMMS", 1);
NCCL_PARAM(IbAdaptiveRouting, "IB_ADAPTIVE_ROUTING", -2);

ncclResult_t pluginInit_v8(ncclDebugLogger_t logFunction);
ncclResult_t pluginInit_v7(ncclDebugLogger_t logFunction);
ncclResult_t pluginInit_v6(ncclDebugLogger_t logFunction);
ncclResult_t pluginInit_v5(ncclDebugLogger_t logFunction);

ncclNet_v8_t ncclNetPlugin_v8 = {
"NCCL RDMA Plugin v8",
pluginInit_v8,
};

ncclNet_v7_t ncclNetPlugin_v7 = {
"NCCL RDMA Plugin v7",
pluginInit_v7,
Expand Down Expand Up @@ -85,17 +94,20 @@ static void pluginSetup()
switch (p2p_plugin) {
#ifdef HAVE_UCX_PLUGIN
case NCCL_P2P_UCX:
ncclNetPlugin_v8 = ucxPlugin_v8;
ncclNetPlugin_v7 = ucxPlugin_v7;
ncclNetPlugin_v6 = ucxPlugin_v6;
ncclNetPlugin_v5 = ucxPlugin_v5;
break;
case NCCL_P2P_UCX_RMA:
ncclNetPlugin_v8 = ucxRmaPlugin_v8;
ncclNetPlugin_v7 = ucxRmaPlugin_v7;
ncclNetPlugin_v6 = ucxRmaPlugin_v6;
ncclNetPlugin_v5 = ucxRmaPlugin_v5;
break;
#endif
default:
ncclNetPlugin_v8 = ibPlugin_v8;
ncclNetPlugin_v7 = ibPlugin_v7;
ncclNetPlugin_v6 = ibPlugin_v6;
ncclNetPlugin_v5 = ibPlugin_v5;
Expand All @@ -104,6 +116,13 @@ static void pluginSetup()

}

ncclResult_t pluginInit_v8(ncclDebugLogger_t logFunction) {
pluginLogFunction = logFunction;
pluginSetup();
INFO(NCCL_INIT|NCCL_NET, "P2P plugin %s", ncclNetPlugin_v8.name);
return ncclNetPlugin_v8.init(logFunction);
}

ncclResult_t pluginInit_v7(ncclDebugLogger_t logFunction) {
pluginLogFunction = logFunction;
pluginSetup();
Expand Down
Loading
Loading